Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into reshape_op_dev

02da0d1b · Yibing Liu · 899c7d6b · 0be34949 · 02da0d1b · 02da0d1b
90 changed file
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,7 +4,6 @@ cache:
    - $HOME/.ccache
    - $HOME/.cache/pip
    - $TRAVIS_BUILD_DIR/build/third_party
-    - $TRAVIS_BUILD_DIR/build_android/third_party
 sudo: required
 dist: trusty
 os:
@@ -12,7 +11,6 @@ os:
 env:
  - JOB=build_doc
  - JOB=check_style
-  - JOB=build_android
 addons:
  apt:
    packages:
@@ -23,7 +21,6 @@ addons:
      - python
      - python-pip
      - python2.7-dev
-      - python-numpy
      - python-wheel
      - libboost-dev
      - curl
@@ -37,8 +34,8 @@ before_install:
  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
  # protobuf version.
-  - pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
+  - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
-  - pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
+  - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
  - curl https://glide.sh/get | bash
  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
  - go get -u github.com/alecthomas/gometalinter

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,8 +65,8 @@ if(NOT CMAKE_BUILD_TYPE)
 endif()
 if(ANDROID)
-    if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
+    if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
-        message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 21")
+        message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
    endif()
    set(WITH_GPU OFF CACHE STRING

--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -4,9 +4,15 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+# ENV variables
+ARG ANDROID_ABI
+ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
 ENV HOME=/root \
    ANDROID_NDK_HOME=/opt/android-ndk-linux \
-    ANDROID_STANDALONE_TOOLCHAIN=/opt/android-toolchain-gcc
+    ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain \
+    ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain
 RUN apt-get update && \
    apt-get install -y \
@@ -15,12 +21,11 @@ RUN apt-get update && \
    apt-get clean -y
 # Install Go and glide
-RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
+# Stream the Go tarball to stdout (-O-) and pipe it straight into tar, so no
+# intermediate go.tgz file is written. The former `go.tgz` argument has been
+# dropped: with -O- it was no longer an output name and wget treated it as an
+# extra URL to fetch.
+RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
-    tar -C /usr/local -xzf go.tgz && \
+    tar -xz -C /usr/local && \
    mkdir /root/gopath && \
    mkdir /root/gopath/bin && \
-    mkdir /root/gopath/src && \
+    mkdir /root/gopath/src
-    rm go.tgz
 ENV GOROOT=/usr/local/go GOPATH=/root/gopath
 # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
 ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
@@ -42,7 +47,8 @@ RUN mkdir /opt/android-ndk-tmp && \
    wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
    unzip -q android-ndk-r14b-linux-x86_64.zip && \
    mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
-    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_STANDALONE_TOOLCHAIN} && \
+    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-23 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \
+    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-23 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \
    rm -rf /opt/android-ndk-tmp && \
    rm -rf ${ANDROID_NDK_HOME}

--- a/cmake/cross_compiling/android.cmake
+++ b/cmake/cross_compiling/android.cmake
@@ -20,6 +20,7 @@
 # The supported variables are listed below:
 # 
 # ANDROID_STANDALONE_TOOLCHAIN
+# ANDROID_TOOLCHAIN
 # ANDROID_ABI
 # ANDROID_NATIVE_API_LEVEL
 # ANDROID_ARM_MODE
@@ -57,6 +58,10 @@ IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL)
    ENDIF()
 ENDIF()
+# Fall back to the clang toolchain when the caller has not defined
+# ANDROID_TOOLCHAIN. The values accepted later in this file are clang and gcc.
+IF(NOT DEFINED ANDROID_TOOLCHAIN)
+    SET(ANDROID_TOOLCHAIN "clang")
+ENDIF()
 IF(NOT DEFINED ANDROID_ABI)
    SET(ANDROID_ABI "armeabi-v7a")
 ENDIF()
@@ -82,6 +87,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
            "${CMAKE_VERSION}), when cross-compiling for Android.")
    IF(ANDROID_STANDALONE_TOOLCHAIN)
+        # Use standalone toolchain
        SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot")
        IF(NOT CMAKE_SYSTEM_VERSION)
@@ -96,26 +102,44 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
        ENDIF()
        # Toolchain
-        SET(ANDROID_TOOLCHAIN "gcc")
        SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN})
-        IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+    ELSEIF(ANDROID_NDK)  # NDK (non-standalone) branch; body is still a TODO
-            SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
+        # TODO: use android ndk
-            IF(ANDROID_ABI STREQUAL "armeabi")
+    ENDIF()
-                SET(CMAKE_SYSTEM_PROCESSOR armv5te)
-            ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
+    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
-                SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
+        SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
-            ENDIF()
+        IF(ANDROID_ABI STREQUAL "armeabi")
-        ENDIF()
+            SET(CMAKE_SYSTEM_PROCESSOR armv5te)
-        IF(ANDROID_ABI STREQUAL "arm64-v8a")
+            SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi)
-            SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
+        ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
-            SET(CMAKE_SYSTEM_PROCESSOR aarch64)
+            SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
+            SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi)
        ENDIF()
-        SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
+    ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
+        SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
+        SET(CMAKE_SYSTEM_PROCESSOR aarch64)
+        SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android)
+    ELSE()
+        MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.")
+    ENDIF()
+    SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
+    IF(ANDROID_TOOLCHAIN STREQUAL clang)
+        SET(ANDROID_C_COMPILER_NAME clang)
+        SET(ANDROID_CXX_COMPILER_NAME clang++)
+        SET(CMAKE_C_COMPILER_TARGET   ${ANDROID_CLANG_TRIPLE})
+        SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE})
+    ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc)
+        SET(ANDROID_C_COMPILER_NAME gcc)
+        SET(ANDROID_CXX_COMPILER_NAME g++)
+    ELSE()
+        MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}")
    ENDIF()
    # C compiler
    IF(NOT CMAKE_C_COMPILER)
-        SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}gcc")
+        SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}")
    ELSE()
        GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
    ENDIF()
@@ -125,7 +149,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
    # CXX compiler
    IF(NOT CMAKE_CXX_COMPILER)
-        SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}g++")
+        SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}")
    ELSE()
        GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
    ENDIF()
@@ -137,7 +161,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
    SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
    # Toolchain and ABI specific flags.
-    SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections -finline-limit=64")
+    SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections")
    SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections")
    IF(ANDROID_ABI STREQUAL "armeabi")
@@ -145,8 +169,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
             -march=armv5te
             -mtune=xscale
             -msoft-float)
-    ENDIF()
+    ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
-    IF(ANDROID_ABI STREQUAL "armeabi-v7a")
        LIST(APPEND ANDROID_COMPILER_FLAGS
             -march=armv7-a
             -mfloat-abi=softfp)
@@ -156,6 +179,8 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
            LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16)
        ENDIF()
        LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8)
+    ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
+        LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
    ENDIF()
    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
@@ -164,10 +189,18 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
        ELSE()
            LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb)
        ENDIF()
+        IF(ANDROID_TOOLCHAIN STREQUAL clang)
+            # Disable integrated-as for better compatibility.
+            LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as)
+        ENDIF()
    ENDIF()
-    IF(ANDROID_ABI STREQUAL "arm64-v8a")
+    IF(ANDROID_TOOLCHAIN STREQUAL clang)
-        LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
+        # CMake automatically forwards all compiler flags to the linker,
+        # and clang doesn't like having -Wa flags being used for linking.
+        # To prevent CMake from doing this would require meddling with
+        # the CMAKE_<LANG>_COMPILE_OBJECT rules, which would get quite messy.
+        LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments)
    ENDIF()
    STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")

--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# Nothing below this point is processed when USE_EIGEN_FOR_BLAS is enabled.
+IF(USE_EIGEN_FOR_BLAS)
+    RETURN()
+ENDIF()
 INCLUDE(cblas)
 IF(NOT ${CBLAS_FOUND})

--- a/doc/design/functions_operators_layers.md
+++ b/doc/design/functions_operators_layers.md
@@ -86,12 +86,13 @@ def layer.fc(X):
 We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`.  So we have the following concepts in above illustrative example:
-```
 | C++ functions/functors | mul          | add          |             |          |
+|------------------------|--------------|--------------|-------------|----------|
 | C++ operator class     | mulOp        | addOp        | FCOp        |          |
 | Python binding         | operator.mul | operator.add | operator.fc |          |
 | Python function        |              |              |             | layer.fc |
-```
 This is how we differentiate layer and operators in PaddlePaddle:

--- a/doc/design/ops/dist_train.md
+++ b/doc/design/ops/dist_train.md
+# Design Doc: Operation Graph Based Parameter Server
+## Abstract
+We propose an approach to implement the parameter server. In this
+approach, there is no fundamental difference between the trainer and
+the parameter server: they both run subgraphs, but subgraphs of
+different purposes.
+## Background
+The previous implementations of the parameter server do not run a
+subgraph. Parameter initialization, optimizer computation, network
+communication and checkpointing are implemented twice, on both the
+trainer and the parameter server.
+It would be great if we could write code once and use it on both the
+trainer and the parameter server: this reduces code duplication and
+improves extensibility. After the current refactor, we are
+representing everything as a computing graph on the
+trainer, so representing everything as a computing graph on the parameter
+server becomes a natural extension.
+## Design
+### Graph Converter
+The *graph converter* converts the user-defined operation (OP) graph
+into subgraphs to be scheduled on different nodes with the following
+steps:
+1. OP placement: the OPs will be placed on different nodes according
+   to a heuristic that minimizes the estimated total computation
+   time. Currently we will use a simple heuristic that puts parameter
+   variables on parameter server workers and everything else on trainer
+   workers.
+1. Add communication OPs to enable the communication between nodes.
+We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
+Below is an example of converting the user defined graph to the
+subgraphs for the trainer and the parameter server:
+<img src="src/local-graph.png" width="300"/>
+After converting:
+<img src="src/dist-graph.png" width="700"/>
+1. The parameter variable W and its optimizer subgraph are placed on the parameter server.
+1. Operators are added to the subgraphs.
+   - *Send* sends data to the connected *Recv* operator.  The
+     scheduler on the receive node will only schedule the *Recv* operator
+     to run when the *Send* operator has run (the *Send* OP will mark
+     the *Recv* OP runnable automatically).
+   - *Enqueue* enqueues the input variable; it can block until space
+     becomes available in the queue.
+   - *Dequeue* outputs a configurable number of tensors from the
+     queue. It will block until the queue has the required number of
+     tensors.
+### Benefits
+- Model parallelism becomes easier to implement: it's an extension to
+  the trainer - parameter server approach. We already have the
+  communication OPs, but need to extend the graph converter's
+  placement functionality.
+- User-defined optimizer is easier to add - user can now express it as
+  a subgraph.
+- No more duplicated logic inside the trainer and the parameter
+  server mentioned in the background section.
+### Challenges
+- It might be hard for the graph converter to cut a general graph
+  (without any hint for which subgraph is the optimizer). We may need
+  to label which subgraph inside the OP graph is the optimizer.
+- It's important to balance the parameter shards on multiple
+  parameter servers. If a single parameter is very big (some
+  word-embedding, fully connected, softmax layer), we need to
+  automatically partition the single parameter onto different
+  parameter servers when possible (only element-wise optimizer depends
+  on the parameter variable).
+### Discussion
+- In the "Async SGD" figure, the "W" variable on the parameter server
+  could be read and written concurrently; what is our locking strategy?
+  E.g., each variable has a lock cpp method to be invoked by every
+  OP, or, have a lock OP.
+- Can the Enqueue OP be implemented under our current tensor design
+  (puts the input tensor into the queue tensor)?
+- *Dequeue* OP will have a variable number of outputs (depends on the
+  `min_count` attribute), does our current design support it? (similar
+  question for the *Add* OP)
+### References:
+[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
--- a/doc/design/ops/src/dist-graph.graffle
+++ b/doc/design/ops/src/dist-graph.graffle
--- a/doc/design/ops/src/dist-graph.png
+++ b/doc/design/ops/src/dist-graph.png
--- a/doc/design/ops/src/local-graph.graffle
+++ b/doc/design/ops/src/local-graph.graffle
--- a/doc/design/ops/src/local-graph.png
+++ b/doc/design/ops/src/local-graph.png
--- a/doc/design/var_desc.md
+++ b/doc/design/var_desc.md
+## Background
+PaddlePaddle divides the description of neural network computation graph into two stages: compile time and runtime.
+PaddlePaddle uses proto messages to describe the compile time graph because:
+1. Computation graph should be able to be saved to a file.
+1. In distributed training, the graph will be serialized and sent to multiple workers.
+The computation graph is constructed by Data Node and Operation Node. The concept to represent them is in the table below.
+| |compile time|runtime|
+|---|---|---|
+|Data|VarDesc(proto)|Variable(cpp)|
+|Operation|OpDesc(proto)|Operator(cpp)|
+## Definition of VarDesc
+A VarDesc should have a name and a value; in PaddlePaddle, the value will always be a tensor. Since we use LoDTensor most of the time, we add a LoDTensorDesc to represent it.
+```proto
+message VarDesc {
+  required string name = 1;
+  optional LoDTensorDesc lod_tensor = 2;
+}
+```
+## Definition of LoDTensorDesc
+```proto
+enum DataType {
+  BOOL = 0;
+  INT16 = 1;
+  INT32 = 2;
+  INT64 = 3;
+  FP16 = 4;
+  FP32 = 5;
+  FP64 = 6;
+}
+message LoDTensorDesc {
+  required DataType data_type = 1;
+  repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+  optional int32 lod_level = 3 [default=0];
+}
+```
+## Definition of Variable in Python
+In Python API, layer will take Variable as Input, and return Variable as Output. There should be a class `Variable` in python to help create and manage Variable.
+```python
+image = Variable(dims=[-1, 640, 480])
+# fc1 and fc2 are both Variable
+fc1 = layer.fc(input=image, output_size=10)
+fc2 = layer.fc(input=fc1, output_size=20)
+```
+### What should class `Variable` have
+1. `name`. A name of string type is used to mark the value of the Variable.
+1. `initializer`. Since our Tensor does not have a value, we will always use some Operator to fill it at run time. So we should have an initialize method to help add the init operator.
+1. `operator`. Variable should record which operator produces it. The reason is:
+  - we use pd.eval(targets=[var1, var2]) to run the related ops to get the value of var1 and var2. var.op is used to trace the dependency of the current variable.
+In PaddlePaddle, we use Block to describe Computation Graph, so in the code we will use Block but not Graph.
+```python
+import VarDesc
+import LoDTensorDesc
+import framework
+def AddInitialOperator(variable, initializer):
+	# add an initialize Operator to block to init this Variable
+class Variable(object):
+   def __init__(self, name, dims, type, initializer):
+      self._block = get_default_block()
+      self._name = name
+      self.op = None
+      tensor_desc = LoDTensorDesc(data_type=type, dims=dims)
+      _var_desc = VarDesc(name=name, lod_tensor=tensor_desc)
+      self._var = framework.CreateVar(_var_desc)
+      self._block.add_var(self)
+      # add initial op according to initializer
+      if initializer is not None:
+          AddInitialOperator(self, initializer)
+   def dims(self):
+      return self._var.dims()
+   def data_type(self):
+       return self._var.data_type()
+   def to_proto(self):
+       pass
+```
+Then we can use this Variable to create a fc layer in Python.
+```python
+import paddle as pd
+def flatten_size(X, num_flatten_dims):
+  prod = 1 # of last num_flatten_dims
+  for i in xrange(num_flatten_dims):
+    prod = prod * X.dims[-i-1]
+  return prod
+def layer.fc(X, output_size, num_flatten_dims):
+  W = Variable(pd.random_uniform(), type=FP32, dims=[flatten_size(X, num_flatten_dims), output_size])
+  b = Variable(pd.random_uniform(), type=FP32, dims=[output_size])
+  out = Variable(type=FP32)
+  y = operator.fc(X, W, b, output=out) # fc will put fc op input into out
+  pd.InferShape(y)
+  return out
+x = Variable(dims=[-1, 640, 480])
+y = layer.fc(x, output_size=100)
+z = layer.fc(y, output_size=200)
+paddle.eval(targets=[z], ...)
+print(z)
+```
--- a/paddle/cuda/include/hl_cpu_gru.cuh
+++ b/paddle/cuda/include/hl_cpu_gru.cuh
@@ -18,14 +18,6 @@ limitations under the License. */
 #ifndef __NVCC__
-#include "paddle/math/MathFunctions.h"
-#ifndef PADDLE_TYPE_DOUBLE
-#define     CBLAS_GEMM     paddle::gemm<float>
-#else
-#define     CBLAS_GEMM     paddle::gemm<double>
-#endif
 template<class OpResetOutput>
 void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
                                       real *gateValue,
@@ -210,51 +202,6 @@ inline void forward_final_output(OpFinalOutput opFinalOutput,
  }
 }
-template<class OpResetOutput, class OpFinalOutput>
-void hl_cpu_gru_forward(OpResetOutput opResetOutput,
-                        OpFinalOutput opFinalOutput,
-                        hl_gru_value value,
-                        int frameSize,
-                        int batchSize,
-                        hl_activation_mode_t active_node,
-                        hl_activation_mode_t active_gate) {
-  if (value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasNoTrans,
-               batchSize,
-               2 * frameSize,
-               frameSize,
-               1,
-               value.prevOutValue,
-               frameSize,
-               value.gateWeight,
-               frameSize * 2,
-               1,
-               value.gateValue,
-               frameSize * 3);
-  }
-  forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate);
-  if (value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasNoTrans,
-               batchSize,
-               frameSize,
-               frameSize,
-               1,
-               value.resetOutputValue,
-               frameSize,
-               value.stateWeight,
-               frameSize,
-               1,
-               value.gateValue + frameSize * 2,
-               frameSize * 3);
-  }
-  forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node);
-}
 template<class OpStateGrad>
 void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad,
                                      real *gateValue,
@@ -525,86 +472,6 @@ inline void backward_reset_grad(OpResetGrad opResetGrad,
  }
 }
-template<class OpStateGrad, class OpResetGrad>
-void hl_cpu_gru_backward(OpStateGrad opStateGrad,
-                         OpResetGrad opResetGrad,
-                         hl_gru_value value,
-                         hl_gru_grad  grad,
-                         int frameSize,
-                         int batchSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate) {
-  backward_state_grad(opStateGrad, value, grad,
-    frameSize, batchSize, active_node);
-  if (value.prevOutValue && grad.prevOutGrad) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasTrans,
-               batchSize,
-               frameSize,
-               frameSize,
-               1,
-               grad.gateGrad + frameSize * 2,
-               frameSize * 3,
-               value.stateWeight,
-               frameSize,
-               0,
-               grad.resetOutputGrad,
-               frameSize);
-    if (grad.stateWeightGrad) {
-      CBLAS_GEMM(CblasTrans,
-                 CblasNoTrans,
-                 frameSize,
-                 frameSize,
-                 batchSize,
-                 1,
-                 value.resetOutputValue,
-                 frameSize,
-                 grad.gateGrad + frameSize * 2,
-                 frameSize * 3,
-                 1,
-                 grad.stateWeightGrad,
-                 frameSize);
-    }
-  }
-  backward_reset_grad(opResetGrad, value, grad,
-    frameSize, batchSize, active_gate);
-  if (grad.prevOutGrad && value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasTrans,
-               batchSize,
-               frameSize,
-               frameSize * 2,
-               1,
-               grad.gateGrad,
-               frameSize * 3,
-               value.gateWeight,
-               frameSize * 2,
-               1,
-               grad.prevOutGrad,
-               frameSize);
-    if (grad.gateWeightGrad) {
-      CBLAS_GEMM(CblasTrans,
-                 CblasNoTrans,
-                 frameSize,
-                 frameSize * 2,
-                 batchSize,
-                 1,
-                 value.prevOutValue,
-                 frameSize,
-                 grad.gateGrad,
-                 frameSize * 3,
-                 1,
-                 grad.gateWeightGrad,
-                 frameSize * 2);
-    }
-  }
-}
 #endif
 #endif  // HL_CPU_GRU_CUH_
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -45,7 +45,19 @@ class GreaterThanChecker {
 public:
  explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
  void operator()(T& value) const {
-    PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail");
+    PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails.");
+  }
+ private:
+  T lower_bound_;
+};
+template <typename T>
+class EqualGreaterThanChecker {
+ public:
+  explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
+  void operator()(T& value) const {
+    PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails.");
  }
 private:
@@ -115,6 +127,11 @@ class TypedAttrChecker {
    return *this;
  }
+  // Appends an EqualGreaterThanChecker built from `lower_bound` to the list
+  // of value checkers and returns *this so that calls can be chained.
+  TypedAttrChecker& EqualGreaterThan(const T& lower_bound) {
+    EqualGreaterThanChecker<T> checker(lower_bound);
+    value_checkers_.push_back(checker);
+    return *this;
+  }
  // we can add more common limits, like LessThan(), Between()...
  TypedAttrChecker& SetDefault(const T& default_value) {

--- a/paddle/framework/backward.md
+++ b/paddle/framework/backward.md
@@ -2,20 +2,20 @@
 ## Motivation
-In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the fundmental gradient operators/expressions together with chain rule . Every forward network need a backward network to construct the full computation graph, the operator/expression's backward pass will be generated respect to forward pass.
+In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the gradient operators/expressions together with the chain rule. Every forward network needs a backward network to construct the full computation graph; the operator/expression's backward pass will be generated with respect to the forward pass.
 ## Backward Operator Registry
-A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs and output gradients and then calculate its input gradients.
+A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs, and output gradients and then calculate its input gradients.
 |                        | forward operator | backward operator 
 | ---------------------- | ---------------- |------------------------- |		
 | **Operator::inputs_**  | Inputs       | Inputs, Outputs, OutputGradients |	
 | **Operator::outputs_** | Outputs          | InputGradients            |
- In most cases, there is a one-to-one correspondence between forward and backward operators. These correspondences are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and make operators pluggable, the registry mechanism is introduced.
+ In most cases, there is a one-to-one correspondence between the forward and backward operators. These correspondences are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and make operators pluggable, the registry mechanism is introduced.
-For example, we have got a `mul_op`, and we can register it's information and corresponding backward operator by the following macro:
+For example, we have got a `mul_op`, and we can register its information and corresponding backward operator by the following macro:
 ```cpp
 REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
@@ -27,17 +27,17 @@ REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
 ## Backward Operator Creating
-Given a certain forward operator, we can get its corresponding backward opeartor by calling:
+Given a certain forward operator, we can get its corresponding backward operator by calling:
 ```cpp
 OperatorBase* bwd_op = BuildGradOp(const OperatorBase* fwd_op);
-``` 
+```
 The function `BuildGradOp` will sequentially execute following processes:
 1. Get the `type_` of given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`.
-2. Build two maps named `inputs` and `outputs` to temporary storage backward operator's inputs and outputs. Copy forward operator's `inputs_` and `outputs_` to map `inputs`, except these are not necessary for gradient computing.
+2. Build two maps named `inputs` and `outputs` to temporarily store the backward operator's inputs and outputs. Copy the forward operator's `inputs_` and `outputs_` to map `inputs`, except those that are not necessary for gradient computing.
 3. Add forward inputs' gradient variables into map `output`, adding forward outputs' gradient variables into map `input`.
@@ -49,31 +49,31 @@ A backward network is a series of backward operators. The main idea of building
 In our design, the network itself is also a kind of operator. So the operators contained by a big network may be some small network. 
-given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`,`InputGradients`.
+Given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`, `InputGradients`.
 1. Op 
-   when the input forward network is a Op, return its gradient Operator Immediately.
+   when the input forward network is an Op, return its gradient Operator Immediately.
 2. NetOp 
-   when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to forward NetOp.
+   when the input forward network is a NetOp, it needs to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to the forward NetOp.
-   **shared variable**. As illustrated in the pictures, two operator's `Output` `Gradient` will overwirte their shared input variable.  
+   **shared variable**. As illustrated in the pictures, two operators' `Output` `Gradient` will overwrite their shared input variable.  
   <p align="center">
-   <img src="./images/duplicate_op.png" width="70%" ><br/>
+   <img src="./images/duplicate_op.png" width="50%" ><br/>
-   1. shared variable in two operators. 
+   1. Shared variable in operators. 
   </p>
-   Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator replace the overwirte links. 
+   Sharing a variable between operators, or using the same input variable in multiple operators, leads to a duplicate gradient variable. As the demo above shows, we need to rename the gradient name recursively and add a generic add operator to replace the overwrite links. 
   <p align="center">
-   <img src="images/duplicate_op2.png" width="90%" ><br/>
+   <img src="images/duplicate_op2.png" width="50%" ><br/>
-   2. replace shared variable gradient with `Add` Operator
+   2. Replace shared variable's gradient with `Add` operator.
   </p>

--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -283,5 +283,14 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
 DDim::DDim(std::initializer_list<int64_t> init_list) {
  *this = make_ddim(init_list);
 }
+// Builds a two-extent DDim from `src`: the first extent is the product of
+// the dimensions in [0, num_col_dims), the second is the product of the
+// dimensions in [num_col_dims, rank).
+DDim flatten_to_2d(const DDim& src, int num_col_dims) {
+  const int rank = src.size();
+  const auto leading = product(slice_ddim(src, 0, num_col_dims));
+  const auto trailing = product(slice_ddim(src, num_col_dims, rank));
+  return make_ddim({leading, trailing});
+}
+// Builds a single-extent DDim whose only entry is the product of all
+// dimensions of `src`.
+DDim flatten_to_1d(const DDim& src) {
+  return make_ddim({product(src)});
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -115,6 +115,12 @@ int arity(const DDim& ddim);
 std::ostream& operator<<(std::ostream&, const DDim&);
+// Reshape a tensor to a matrix. The matrix's first dimension (column length)
+// will be the product of the tensor's first `num_col_dims` dimensions; its
+// second dimension will be the product of the remaining dimensions.
+DDim flatten_to_2d(const DDim& src, int num_col_dims);
+DDim flatten_to_1d(const DDim& src);
 }  // namespace framework
 }  // namespace paddle

--- a/paddle/framework/eigen.h
+++ b/paddle/framework/eigen.h
@@ -63,20 +63,35 @@ struct EigenTensor {
 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
-struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {};
+struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {
+  // Views `tensor` as a 2-D Eigen matrix whose shape is
+  // flatten_to_2d(tensor.dims(), num_col_dims). `num_col_dims` must lie
+  // strictly between 0 and the tensor's rank.
+  static typename EigenMatrix::Type Reshape(Tensor& tensor, int num_col_dims) {
+    const int rank = tensor.dims_.size();
+    PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank,
+                   "`num_col_dims` must be between (0, rank_of_tensor).");
+    const DDim matrix_dims = flatten_to_2d(tensor.dims(), num_col_dims);
+    return EigenMatrix::From(tensor, matrix_dims);
+  }
+  // Read-only overload of Reshape for const tensors; same shape rule and the
+  // same `num_col_dims` precondition.
+  static typename EigenMatrix::ConstType Reshape(const Tensor& tensor,
+                                                 int num_col_dims) {
+    const int rank = tensor.dims_.size();
+    PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank,
+                   "`num_col_dims` must be between (0, rank_of_tensor).");
+    const DDim matrix_dims = flatten_to_2d(tensor.dims(), num_col_dims);
+    return EigenMatrix::From(tensor, matrix_dims);
+  }
+};
 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
  // Flatten reshapes a Tensor into an EigenVector.
  static typename EigenVector::Type Flatten(Tensor& tensor) {
-    return EigenVector::From(
+    return EigenVector::From(tensor, {product(tensor.dims_)});
-        tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
  }
  static typename EigenVector::ConstType Flatten(const Tensor& tensor) {
-    return EigenVector::From(
+    return EigenVector::From(tensor, {product(tensor.dims_)});
-        tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
  }
 };

--- a/paddle/framework/eigen_test.cc
+++ b/paddle/framework/eigen_test.cc
@@ -108,5 +108,24 @@ TEST(Eigen, Matrix) {
  }
 }
+TEST(Eigen, MatrixReshape) {
+  // A {2, 3, 6, 4} tensor filled with 0, 1, 2, ... is viewed as a matrix with
+  // the first two dimensions folded into rows and the last two into columns.
+  const int rows = 2 * 3;
+  const int cols = 6 * 4;
+  Tensor t;
+  float* p = t.mutable_data<float>({2, 3, 6, 4}, platform::CPUPlace());
+  for (int k = 0; k < rows * cols; ++k) {
+    p[k] = static_cast<float>(k);
+  }
+  EigenMatrix<float>::Type em = EigenMatrix<float>::Reshape(t, 2);
+  ASSERT_EQ(rows, em.dimension(0));
+  ASSERT_EQ(cols, em.dimension(1));
+  // Element (r, c) of the matrix must be the r * cols + c-th value written.
+  for (int r = 0; r < rows; r++) {
+    for (int c = 0; c < cols; c++) {
+      ASSERT_NEAR(r * cols + c, em(r, c), 1e-6f);
+    }
+  }
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/images/duplicate_op2.graffle
+++ b/paddle/framework/images/duplicate_op2.graffle
--- a/paddle/framework/images/duplicate_op2.png
+++ b/paddle/framework/images/duplicate_op2.png
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -43,6 +43,9 @@ class Tensor {
  template <typename T, size_t D, int MajorType, typename IndexType>
  friend struct EigenTensor;
+  template <typename T, int MajorType, typename IndexType>
+  friend struct EigenMatrix;
  template <typename T, int MajorType, typename IndexType>
  friend struct EigenVector;

--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -148,5 +148,13 @@ inline Tensor& Tensor::Resize(const DDim& dims) {
 inline const DDim& Tensor::dims() const { return dims_; }
+template <typename T>  // T is the element type forwarded to ShareDataWith<T>.
+inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {  // Returns a new Tensor built from `src`.
+  Tensor res;
+  res.ShareDataWith<T>(src);  // presumably aliases src's buffer rather than copying - confirm in ShareDataWith
+  res.Resize(flatten_to_2d(src.dims(), num_col_dims));  // dims become flatten_to_2d(src.dims(), num_col_dims)
+  return res;
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -262,3 +262,16 @@ TEST(Tensor, CopyFrom) {
  }
 #endif
 }
+TEST(Tensor, ReshapeToMatrix) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  // Expected matrix extents after folding {2, 3, 4, 9} at num_col_dims = 2.
+  const int kRows = 2 * 3;
+  const int kCols = 4 * 9;
+  Tensor src;
+  int* src_ptr = src.mutable_data<int>({2, 3, 4, 9}, CPUPlace());
+  for (int idx = 0; idx < kRows * kCols; ++idx) {
+    src_ptr[idx] = idx;
+  }
+  Tensor res = ReshapeToMatrix<int>(src, 2);
+  ASSERT_EQ(res.dims()[0], kRows);
+  ASSERT_EQ(res.dims()[1], kCols);
+}
\ No newline at end of file
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -44,6 +44,7 @@ if(WITH_GPU)
    add_simple_unittest(RowConvOpTest)
    add_simple_unittest(BlockExpandOpTest)
    add_simple_unittest(CropOpTest)
+    add_simple_unittest(SwitchOpTest)
 endif()
 add_simple_unittest(Im2ColTest)

--- a/paddle/function/EigenGemm.cpp
+++ b/paddle/function/EigenGemm.cpp
@@ -83,9 +83,9 @@ struct EigenBlasGemm {
 };
 #ifdef PADDLE_TYPE_DOUBLE
-template class EigenBlasGemm<double>;
+template struct EigenBlasGemm<double>;
 #else
-template class EigenBlasGemm<float>;
+template struct EigenBlasGemm<float>;
 #endif
 }  // namespace paddle
--- a/paddle/function/GruFunctor.h
+++ b/paddle/function/GruFunctor.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "GemmFunctor.h"
+#include "hl_cpu_gru.cuh"
+namespace paddle {
+template <DeviceType Device, class T>  // GRU forward step: two BlasGemm calls interleaved with the forward_* helpers.
+struct GruFunctor {
+  template <class OpResetOutput, class OpFinalOutput>
+  static void compute(OpResetOutput opResetOutput,
+                      OpFinalOutput opFinalOutput,
+                      hl_gru_value value,
+                      int frameSize,
+                      int batchSize,
+                      hl_activation_mode_t active_node,
+                      hl_activation_mode_t active_gate) {
+#ifndef __NVCC__  // the whole body is compiled out when building with nvcc
+    if (value.prevOutValue) {  // skipped when no previous output is supplied
+      BlasGemm<Device, T>::compute(false,  // assumes args are (transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) - confirm in GemmFunctor.h
+                                   false,
+                                   batchSize,
+                                   2 * frameSize,
+                                   frameSize,
+                                   1,
+                                   value.prevOutValue,
+                                   frameSize,
+                                   value.gateWeight,
+                                   frameSize * 2,
+                                   1,  // beta = 1: result is accumulated into gateValue
+                                   value.gateValue,  // first 2 * frameSize columns of each 3 * frameSize wide row
+                                   frameSize * 3);
+    }
+    forward_reset_output(
+        opResetOutput, value, frameSize, batchSize, active_gate);
+    if (value.prevOutValue) {
+      BlasGemm<Device, T>::compute(false,  // gateValue[:, 2F:3F] += resetOutputValue * stateWeight (F = frameSize)
+                                   false,
+                                   batchSize,
+                                   frameSize,
+                                   frameSize,
+                                   1,
+                                   value.resetOutputValue,
+                                   frameSize,
+                                   value.stateWeight,
+                                   frameSize,
+                                   1,
+                                   value.gateValue + frameSize * 2,  // offset selects the last frameSize columns
+                                   frameSize * 3);
+    }
+    forward_final_output(
+        opFinalOutput, value, frameSize, batchSize, active_node);
+#endif
+  }
+};
+template <DeviceType Device, class T>  // GRU backward step: backward_* helpers interleaved with four BlasGemm calls.
+struct GruGradFunctor {
+  template <class OpStateGrad, class OpResetGrad>
+  static void compute(OpStateGrad opStateGrad,
+                      OpResetGrad opResetGrad,
+                      hl_gru_value value,
+                      hl_gru_grad grad,
+                      int frameSize,
+                      int batchSize,
+                      hl_activation_mode_t active_node,
+                      hl_activation_mode_t active_gate) {
+#ifndef __NVCC__  // the whole body is compiled out when building with nvcc
+    backward_state_grad(
+        opStateGrad, value, grad, frameSize, batchSize, active_node);
+    if (value.prevOutValue && grad.prevOutGrad) {  // both pointers must be non-null for the GEMMs below
+      BlasGemm<Device, T>::compute(false,  // assumes args are (transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) - confirm in GemmFunctor.h
+                                   true,
+                                   batchSize,
+                                   frameSize,
+                                   frameSize,
+                                   1,
+                                   grad.gateGrad + frameSize * 2,  // last frameSize columns of each 3 * frameSize wide row
+                                   frameSize * 3,
+                                   value.stateWeight,
+                                   frameSize,
+                                   0,  // beta = 0: resetOutputGrad is overwritten, not accumulated
+                                   grad.resetOutputGrad,
+                                   frameSize);
+      if (grad.stateWeightGrad) {  // weight gradient is optional
+        BlasGemm<Device, T>::compute(true,  // stateWeightGrad += resetOutputValue^T * gateGrad[:, 2F:3F] (F = frameSize)
+                                     false,
+                                     frameSize,
+                                     frameSize,
+                                     batchSize,
+                                     1,
+                                     value.resetOutputValue,
+                                     frameSize,
+                                     grad.gateGrad + frameSize * 2,
+                                     frameSize * 3,
+                                     1,
+                                     grad.stateWeightGrad,
+                                     frameSize);
+      }
+    }
+    backward_reset_grad(
+        opResetGrad, value, grad, frameSize, batchSize, active_gate);
+    if (grad.prevOutGrad && value.prevOutValue) {  // same condition as the earlier guard, operands swapped
+      BlasGemm<Device, T>::compute(false,  // prevOutGrad += gateGrad[:, 0:2F] * gateWeight^T
+                                   true,
+                                   batchSize,
+                                   frameSize,
+                                   frameSize * 2,
+                                   1,
+                                   grad.gateGrad,
+                                   frameSize * 3,
+                                   value.gateWeight,
+                                   frameSize * 2,
+                                   1,
+                                   grad.prevOutGrad,
+                                   frameSize);
+      if (grad.gateWeightGrad) {  // weight gradient is optional
+        BlasGemm<Device, T>::compute(true,  // gateWeightGrad += prevOutValue^T * gateGrad[:, 0:2F]
+                                     false,
+                                     frameSize,
+                                     frameSize * 2,
+                                     batchSize,
+                                     1,
+                                     value.prevOutValue,
+                                     frameSize,
+                                     grad.gateGrad,
+                                     frameSize * 3,
+                                     1,
+                                     grad.gateWeightGrad,
+                                     frameSize * 2);
+      }
+    }
+#endif
+  }
+};
+}  // namespace paddle
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -94,95 +94,4 @@ public:
                  int paddingWidth);
 };
-template <class T>
-struct Padding {
-  static void run(const T* src,
-                  T* dest,
-                  int channels,
-                  int inputHeight,
-                  int inputWidth,
-                  int paddingHeight,
-                  int paddingWidth) {
-    const int destWidth = inputWidth + 2 * paddingWidth;
-    for (int c = 0; c < channels; c++) {
-      if (paddingHeight > 0) {
-        memset(dest, 0, destWidth * paddingHeight * sizeof(T));
-        dest += destWidth * paddingHeight;
-      }
-      for (int i = 0; i < inputHeight; i++) {
-        // padding head
-        for (int j = 0; j < paddingWidth; j++) {
-          *dest++ = T(0);
-        }
-        memcpy(dest, src, inputWidth * sizeof(T));
-        dest += inputWidth;
-        src += inputWidth;
-        // padding tail
-        for (int j = 0; j < paddingWidth; j++) {
-          *dest++ = T(0);
-        }
-      }
-      if (paddingHeight > 0) {
-        memset(dest, 0, destWidth * paddingHeight * sizeof(T));
-        dest += destWidth * paddingHeight;
-      }
-    }
-  }
-};
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-template <>
-struct Padding<float> {
-  static void run(const float* src,
-                  float* dest,
-                  int channels,
-                  int inputHeight,
-                  int inputWidth,
-                  int paddingHeight,
-                  int paddingWidth) {
-    const int destWidth = inputWidth + 2 * paddingWidth;
-    for (int c = 0; c < channels; c++) {
-      if (paddingHeight > 0) {
-        memset(dest, 0, destWidth * paddingHeight * sizeof(float));
-        dest += destWidth * paddingHeight;
-      }
-      for (int i = 0; i < inputHeight; i++) {
-        // padding head
-        for (int j = 0; j < paddingWidth; j++) {
-          *dest++ = float(0);
-        }
-        int step = inputWidth >> 2;
-        int remain = inputWidth & 3;
-        for (int s = 0; s < step; s++) {
-          float32x4_t s0 = vld1q_f32(src);
-          vst1q_f32(dest, s0);
-          src += 4;
-          dest += 4;
-        }
-        for (int r = 0; r < remain; r++) {
-          *dest++ = *src++;
-        }
-        // padding tail
-        for (int j = 0; j < paddingWidth; j++) {
-          *dest++ = float(0);
-        }
-      }
-      if (paddingHeight > 0) {
-        memset(dest, 0, destWidth * paddingHeight * sizeof(float));
-        dest += destWidth * paddingHeight;
-      }
-    }
-  }
-};
-#endif
 }  // namespace paddle
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
@@ -13,18 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "MulOp.h"
-/// todo(tianbing), delete it
+#include "GemmFunctor.h"
-#include <iostream>
-#include "paddle/math/MathFunctions.h"
 #include "paddle/math/SIMDFunctions.h"
 #include "paddle/utils/ThreadLocal.h"
-#ifndef PADDLE_TYPE_DOUBLE
-#define GEMM paddle::gemm<float>
-#else
-#define GEMM paddle::gemm<double>
-#endif
 namespace {
 inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
  for (unsigned int i = 0; i < len; ++i) {
@@ -114,19 +106,20 @@ void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
                            real scaleT,
                            bool aTrans,
                            bool bTrans) {
-  GEMM(aTrans ? CblasTrans : CblasNoTrans,
+  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
-       bTrans ? CblasTrans : CblasNoTrans,
+      aTrans,
-       out.getHeight(),
+      bTrans,
-       out.getWidth(),
+      out.getHeight(),
-       !aTrans ? a.getWidth() : a.getHeight(),
+      out.getWidth(),
-       scaleAB,
+      !aTrans ? a.getWidth() : a.getHeight(),
-       a.getData(),
+      scaleAB,
-       a.getStride(),
+      a.getData(),
-       b.getData(),
+      a.getStride(),
-       b.getStride(),
+      b.getData(),
-       scaleT,
+      b.getStride(),
-       out.getData(),
+      scaleT,
-       out.getStride());
+      out.getData(),
+      out.getStride());
 }
 /// dense matrix (+)= sparse matrix * dense matrix

--- a/paddle/function/SwitchOp.cpp
+++ b/paddle/function/SwitchOp.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "SwitchOp.h"
+#include "paddle/math/Vector.h"
+namespace paddle {
+template <>
+void NCHW2NHWC<DEVICE_TYPE_CPU>(real* outputs,
+                                const real* inputs,
+                                const int num,
+                                const int inC,
+                                const int inH,
+                                const int inW,
+                                const int argType) {
+  // Read the input linearly in NCHW order and scatter each element to its
+  // NHWC position, accumulating for ADD_TO and overwriting otherwise.
+  const bool accumulate = (argType == ADD_TO);
+  const real* src = inputs;
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < inC; ++c) {
+      for (int h = 0; h < inH; ++h) {
+        for (int w = 0; w < inW; ++w, ++src) {
+          real& dst = outputs[((n * inH + h) * inW + w) * inC + c];
+          if (accumulate) {
+            dst += *src;
+          } else {
+            dst = *src;
+          }
+        }
+      }
+    }
+  }
+}
+template <>
+void NHWC2NCHW<DEVICE_TYPE_CPU>(real* outputs,
+                                const real* inputs,
+                                const int num,
+                                const int inH,
+                                const int inW,
+                                const int inC,
+                                const int argType) {
+  // Read the input linearly in NHWC order and scatter each element to its
+  // NCHW position, accumulating for ADD_TO and overwriting otherwise.
+  const bool accumulate = (argType == ADD_TO);
+  const real* src = inputs;
+  for (int n = 0; n < num; ++n) {
+    for (int h = 0; h < inH; ++h) {
+      for (int w = 0; w < inW; ++w) {
+        for (int c = 0; c < inC; ++c, ++src) {
+          real& dst = outputs[((n * inC + c) * inH + h) * inW + w];
+          if (accumulate) {
+            dst += *src;
+          } else {
+            dst = *src;
+          }
+        }
+      }
+    }
+  }
+}
+/**
+ * \brief  Function wrapper that reorders a 4D image tensor from
+ *         'batch_size, channels, height, width' to
+ *         'batch_size, height, width, channels'.
+ *
+ * Argument in this Function:
+ * \param inputs  one buffer in 'batch_size, channels, height, width' order.
+ * \param outputs one buffer in 'batch_size, height, width, channels' order.
+ */
+template <DeviceType Device>
+class NCHW2NHWCFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {}
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    // The input shape is read as [num, channels, height, width].
+    const auto& inShape = inputs[0].shape();
+    const size_t num = inShape[0];
+    const size_t inC = inShape[1];
+    const size_t inH = inShape[2];
+    const size_t inW = inShape[3];
+    real* dst = outputs[0].data<real>();
+    const real* src = inputs[0].data<real>();
+    NCHW2NHWC<Device>(dst, src, num, inC, inH, inW, outputs[0].getArgType());
+  }
+};
+/**
+ * \brief  Function wrapper that reorders a 4D image tensor from
+ *         'batch_size, height, width, channels' to
+ *         'batch_size, channels, height, width'.
+ *
+ * Argument in this Function:
+ * \param inputs  one buffer in 'batch_size, height, width, channels' order.
+ * \param outputs one buffer in 'batch_size, channels, height, width' order.
+ */
+template <DeviceType Device>
+class NHWC2NCHWFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {}
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    // The input shape is read as [num, height, width, channels].
+    const auto& inShape = inputs[0].shape();
+    const size_t num = inShape[0];
+    const size_t inH = inShape[1];
+    const size_t inW = inShape[2];
+    const size_t inC = inShape[3];
+    real* dst = outputs[0].data<real>();
+    const real* src = inputs[0].data<real>();
+    NHWC2NCHW<Device>(dst, src, num, inH, inW, inC, outputs[0].getArgType());
+  }
+};
+REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc);
+REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc);
+REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc);
+#endif
+}  // namespace paddle
--- a/paddle/function/SwitchOp.h
+++ b/paddle/function/SwitchOp.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Function.h"
+namespace paddle {
+/**
+ * \brief  This function switches the dimension order of an image input.
+ *         The input and output are 4D tensors. Switch order
+ *         'batch_size, channels, height, width' to
+ *         order 'batch_size, height, width, channels'.
+ *
+ * \param[out] outputs save results.
+ * \param[in]  inputs  input data.
+ * \param[in]  num     batch size of input data.
+ * \param[in]  inC     channel number of input data.
+ * \param[in]  inH     height of input data.
+ * \param[in]  inW     width of input data.
+ * \param[in]  argType type of output argument: ADD_TO accumulates into
+ *                     outputs, any other value overwrites outputs.
+ */
+template <DeviceType Device>
+void NCHW2NHWC(real* outputs,
+               const real* inputs,
+               const int num,
+               const int inC,
+               const int inH,
+               const int inW,
+               const int argType);
+/**
+ * \brief  This function switches the dimension order of an image input.
+ *         The input and output are 4D tensors. Switch order
+ *         'batch_size, height, width, channels' to
+ *         order 'batch_size, channels, height, width'.
+ *
+ * \param[out] outputs save results.
+ * \param[in]  inputs  input data.
+ * \param[in]  num     batch size of input data.
+ * \param[in]  inH     height of input data.
+ * \param[in]  inW     width of input data.
+ * \param[in]  inC     channel number of input data.
+ * \param[in]  argType type of output argument: ADD_TO accumulates into
+ *                     outputs, any other value overwrites outputs.
+ */
+template <DeviceType Device>
+void NHWC2NCHW(real* outputs,
+               const real* inputs,
+               const int num,
+               const int inH,
+               const int inW,
+               const int inC,
+               const int argType);
+}  // namespace paddle
--- a/paddle/function/SwitchOpGpu.cu
+++ b/paddle/function/SwitchOpGpu.cu
+/* Copyright (c) 2016 Paddle
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "SwitchOp.h"
+#include "hl_base.h"
+namespace paddle {
+__global__ void KeNCHW2NHWC(real* outputs,
+                            const real* inputs,
+                            int inC,
+                            int inH,
+                            int inW,
+                            int nthreads,
+                            int argType) {
+  // One thread per element; `idx` is the linear NCHW index into `inputs`.
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx >= nthreads) return;
+  // Peel the coordinates off the linear index, fastest-varying first.
+  int rest = idx;
+  const int w = rest % inW;
+  rest /= inW;
+  const int h = rest % inH;
+  rest /= inH;
+  const int c = rest % inC;
+  const int n = rest / inC;
+  // Destination offset of (n, h, w, c) in NHWC order.
+  const int off = ((n * inH + h) * inW + w) * inC + c;
+  if (argType == ADD_TO) {
+    outputs[off] += inputs[idx];
+  } else {
+    outputs[off] = inputs[idx];
+  }
+}
+// GPU specialization: launches KeNCHW2NHWC with one thread per element on
+// STREAM_DEFAULT and then runs CHECK_SYNC.
+template <>
+void NCHW2NHWC<DEVICE_TYPE_GPU>(real* outputs,
+                                const real* inputs,
+                                const int num,
+                                const int inC,
+                                const int inH,
+                                const int inW,
+                                const int argType) {
+  // The kernel takes the element count as `int`, and the product below is
+  // evaluated in `int` arithmetic anyway, so keep it `int` (as the NHWC2NCHW
+  // launcher does) rather than widening to size_t and narrowing it back.
+  const int nth = num * inC * inH * inW;
+  const int blockSize = 1024;
+  // Round up so a partially filled last block still covers the tail.
+  const int gridSize = (nth + blockSize - 1) / blockSize;
+  KeNCHW2NHWC<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      outputs, inputs, inC, inH, inW, nth, argType);
+  CHECK_SYNC("NCHW2NHWC");
+}
+__global__ void KeNHWC2NCHW(real* outputs,
+                            const real* inputs,
+                            int inH,
+                            int inW,
+                            int inC,
+                            int nthreads,
+                            int argType) {
+  // One thread per element; `idx` is the linear NHWC index into `inputs`.
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx >= nthreads) return;
+  // Peel the coordinates off the linear index, fastest-varying first.
+  int rest = idx;
+  const int c = rest % inC;
+  rest /= inC;
+  const int w = rest % inW;
+  rest /= inW;
+  const int h = rest % inH;
+  const int n = rest / inH;
+  // Destination offset of (n, c, h, w) in NCHW order.
+  const int off = ((n * inC + c) * inH + h) * inW + w;
+  if (argType == ADD_TO) {
+    outputs[off] += inputs[idx];
+  } else {
+    outputs[off] = inputs[idx];
+  }
+}
+// GPU specialization: launches KeNHWC2NCHW with one thread per element on
+// STREAM_DEFAULT and then runs CHECK_SYNC.
+template <>
+void NHWC2NCHW<DEVICE_TYPE_GPU>(real* outputs,
+                                const real* inputs,
+                                const int num,
+                                const int inH,
+                                const int inW,
+                                const int inC,
+                                const int argType) {
+  const int total = num * inC * inH * inW;
+  const int threadsPerBlock = 1024;
+  // Round up so a partially filled last block still covers the tail.
+  const int numBlocks = (total + threadsPerBlock - 1) / threadsPerBlock;
+  KeNHWC2NCHW<<<numBlocks, threadsPerBlock, 0, STREAM_DEFAULT>>>(
+      outputs, inputs, inH, inW, inC, total, argType);
+  CHECK_SYNC("NHWC2NCHW");
+}
+}  // namespace paddle
--- a/paddle/function/SwitchOpTest.cpp
+++ b/paddle/function/SwitchOpTest.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+namespace paddle {
+// Compares the CPU and GPU implementations of the NCHW2NHWC and NHWC2NCHW
+// functions over a grid of batch sizes, channel counts and image sizes.
+TEST(SwitchOp, real) {
+  for (size_t numSamples : {1, 4, 8, 16}) {
+    for (size_t channels : {1, 4, 8, 16}) {
+      for (size_t imgSizeH : {1, 4, 8, 16}) {
+        for (size_t imgSizeW : {1, 4, 8, 16}) {
+          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
+          // toNCHW selects the direction: true runs NHWC2NCHW (NHWC input,
+          // NCHW output), false runs NCHW2NHWC (NCHW input, NHWC output).
+          for (bool toNCHW : {true, false}) {
+            CpuGpuFuncCompare compare(toNCHW ? "NHWC2NCHW" : "NCHW2NHWC",
+                                      FuncConfig());
+            TensorShape nchwDims{numSamples, channels, imgSizeH, imgSizeW};
+            TensorShape nhwcDims{numSamples, imgSizeH, imgSizeW, channels};
+            compare.addInputs(
+                BufferArg(VALUE_TYPE_FLOAT, toNCHW ? nhwcDims : nchwDims));
+            compare.addOutputs(BufferArg(
+                VALUE_TYPE_FLOAT, toNCHW ? nchwDims : nhwcDims, ASSIGN_TO));
+            compare.run();
+          }
+        }
+      }
+    }
+  }
+}
+}  // namespace paddle
--- a/paddle/function/neon/NeonDepthwiseConv.cpp
+++ b/paddle/function/neon/NeonDepthwiseConv.cpp
--- a/paddle/function/neon/NeonDepthwiseConv.h
+++ b/paddle/function/neon/NeonDepthwiseConv.h
--- a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp
+++ b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "NeonDepthwiseConv.h"
+#include "paddle/function/ConvOp.h"
+namespace paddle {
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+template <DeviceType Device>  // Depthwise transposed convolution: expands/pads the input, then runs a stride-1 depthwise kernel.
+class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    int batchSize = input[0];  // input shape is read as [batch, channels, height, width]
+    int inputChannels = input[1];
+    int inputHeight = input[2];
+    int inputWidth = input[3];
+    int filterHeight = getFilterHeight(filter);
+    int filterWidth = getFilterWidth(filter);
+    int outputChannels = output[1];  // output shape is read as [batch, channels, height, width]
+    int outputHeight = output[2];
+    int outputWidth = output[3];
+    int filterMultiplier = outputChannels / groups_;  // output channels per group
+    CHECK_EQ(inputChannels, groups_);  // requires exactly one group per input channel
+    // only support strideH() == strideW() and filterHeight == filterWidth.
+    CHECK_EQ(strideH(), strideW());
+    CHECK_EQ(paddingH(), paddingW());
+    CHECK_EQ(filterHeight, filterWidth);
+    float* inputData = inputs[0].data<float>();
+    float* filterData = inputs[1].data<float>();
+    float* outputData = outputs[0].data<float>();
+    // padding the input, input -> inputPadding
+    float* inputPadding = inputData;  // stays the raw input unless the branch below re-points it
+    int padInputHeight =
+        (inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH();
+    int padInputWidth =
+        (inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW();
+    if (padInputHeight > inputHeight || padInputWidth > inputWidth) {  // a scratch buffer is needed only when the padded extent is larger
+      int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
+      resizeBuffer<Device>(newSize);
+      inputPadding = reinterpret_cast<float*>(memory_->getBuf());
+      if (strideH() == 1) {
+        neon::Padding<float>::run(inputData,  // NOTE(review): last two args are the padded extents, not pad amounts - confirm against neon::Padding's signature
+                                  inputPadding,
+                                  batchSize * inputChannels,
+                                  inputHeight,
+                                  inputWidth,
+                                  padInputHeight,
+                                  padInputWidth);
+      } else if (strideH() == 2) {
+        neon::StridePadding::run(inputData,
+                                 inputPadding,
+                                 batchSize * inputChannels,
+                                 inputHeight,
+                                 inputWidth,
+                                 padInputHeight,
+                                 padInputWidth);
+      } else {
+        LOG(FATAL) << "Not supported";  // only strides 1 and 2 are handled
+      }
+    }
+    std::function<void(
+        const float*, const float*, int, int, int, int, int, int, float*)>
+        DepthWiseConv;
+    if (filterWidth == 3) {  // kernel is picked by filter width; only 3 and 4 are handled
+      DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;  // second template argument is presumably the stride - confirm in NeonDepthwiseConv.h
+    } else if (filterWidth == 4) {
+      DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
+    } else {
+      LOG(FATAL) << "Not supported";
+    }
+    for (int i = 0; i < batchSize; i++) {  // one kernel call per sample
+      DepthWiseConv(inputPadding,
+                    filterData,
+                    padInputHeight,
+                    padInputWidth,
+                    outputChannels,
+                    outputHeight,
+                    outputWidth,
+                    filterMultiplier,
+                    outputData);
+      inputPadding += inputChannels * padInputHeight * padInputWidth;  // advance to the next sample's padded input
+      outputData += outputChannels * outputHeight * outputWidth;  // advance to the next sample's output
+    }
+  }
+};
+#ifndef PADDLE_TYPE_DOUBLE
+REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose,
+                    CPU,
+                    NeonDepthwiseConvTransposeFunction);
+#endif
+#endif
+}  // namespace paddle
--- a/paddle/function/neon/neon_util.h
+++ b/paddle/function/neon/neon_util.h
@@ -33,12 +33,8 @@ inline float32_t vaddvq_f32(float32x4_t a) {
  return vget_lane_f32(vpadd_f32(v, v), 0);
 }
-inline float32x4_t vmlaq_laneq_f32(float32x4_t a,
+#define vmlaq_laneq_f32(a, b, v, lane) \
-                                   float32x4_t b,
+  vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane))
-                                   float32x4_t v,
-                                   const int lane) {
-  return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane));
-}
 #endif
 }  // namespace neon

--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -62,14 +62,18 @@ void BatchNormBaseLayer::calFeatureMapSize() {
  const ImageConfig& conf = config_.inputs(0).image_conf();
  imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
  imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
+  imageD_ = inputLayers_[0]->getOutput().getFrameDepth();
+  if (0 == imageD_) imageD_ = conf.img_size_z();
  if (imageH_ == 0 && imageW_ == 0) {
    imageH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
    imageW_ = conf.img_size();
  } else {
    getOutput().setFrameHeight(imageH_);
    getOutput().setFrameWidth(imageW_);
+    getOutput().setFrameDepth(imageD_);
  }
-  imgPixels_ = imageH_ * imageW_;
+  imgPixels_ = imageH_ * imageW_ * imageD_;
 }
 }  // namespace paddle
--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -80,6 +80,7 @@ protected:
  /// Height or width of input image feature.
  /// Both of them are 1 if the input is fully-connected layer.
+  int imageD_;
  int imageH_;
  int imageW_;
  /// Height * Width.

--- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
@@ -37,7 +37,7 @@ bool CudnnBatchNormLayer::init(const LayerMap& layerMap,
 }
 void CudnnBatchNormLayer::reshape(int batchSize) {
-  hl_tensor_reshape(ioDesc_, batchSize, channels_, imageH_, imageW_);
+  hl_tensor_reshape(ioDesc_, batchSize, channels_, imageH_ * imageD_, imageW_);
 }
 void CudnnBatchNormLayer::forward(PassType passType) {
@@ -104,7 +104,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                   EPS,
                                   batchSize,
                                   channels_,
-                                   imageH_,
+                                   imageH_ * imageD_,
                                   imageW_);
    }
  }

--- a/paddle/gserver/layers/GruCompute.cpp
+++ b/paddle/gserver/layers/GruCompute.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "GruCompute.h"
 #include "hl_recurrent_apply.cuh"
+#include "paddle/function/GruFunctor.h"
 #include "paddle/utils/Util.h"
 namespace paddle {
@@ -25,13 +26,13 @@ void GruCompute::init(LayerConfig &config) {
 template <>
 void GruCompute::forward<0>(hl_gru_value value, int frameSize, int batchSize) {
-  hl_cpu_gru_forward(hppl::forward::gru_resetOutput(),
+  GruFunctor<DEVICE_TYPE_CPU, real>::compute(hppl::forward::gru_resetOutput(),
-                     hppl::forward::gru_finalOutput(),
+                                             hppl::forward::gru_finalOutput(),
-                     value,
+                                             value,
-                     frameSize,
+                                             frameSize,
-                     batchSize,
+                                             batchSize,
-                     activeNode_,
+                                             activeNode_,
-                     activeGate_);
+                                             activeGate_);
 }
 template <>
@@ -39,14 +40,15 @@ void GruCompute::backward<0>(hl_gru_value value,
                             hl_gru_grad grad,
                             int frameSize,
                             int batchSize) {
-  hl_cpu_gru_backward(hppl::backward::gru_stateGrad(),
+  GruGradFunctor<DEVICE_TYPE_CPU, real>::compute(
-                      hppl::backward::gru_resetGrad(),
+      hppl::backward::gru_stateGrad(),
-                      value,
+      hppl::backward::gru_resetGrad(),
-                      grad,
+      value,
-                      frameSize,
+      grad,
-                      batchSize,
+      frameSize,
-                      activeNode_,
+      batchSize,
-                      activeGate_);
+      activeNode_,
+      activeGate_);
 }
 }  // namespace paddle
--- a/paddle/gserver/layers/SwitchOrderLayer.cpp
+++ b/paddle/gserver/layers/SwitchOrderLayer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "SwitchOrderLayer.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+REGISTER_LAYER(switch_order, SwitchOrderLayer);
+// Reads the static image and reshape configuration of the layer and creates
+// the two layout-conversion functions used by forward() and backward().
+// Always returns true.
+bool SwitchOrderLayer::init(const LayerMap& layerMap,
+                            const ParameterMap& parameterMap) {
+  // The base class has to be initialised before config_ is consulted.
+  Layer::init(layerMap, parameterMap);
+
+  const auto& imageConf = config_.inputs(0).image_conf();
+  const size_t depth = imageConf.img_size_z();
+  // img_size is used for both sides unless an explicit img_size_y is present.
+  size_t height = imageConf.img_size();
+  if (imageConf.has_img_size_y()) {
+    height = imageConf.img_size_y();
+  }
+  const size_t width = imageConf.img_size();
+  const size_t channels = imageConf.channels();
+
+  // Depth is folded into the height axis so the input is described by a 4-D
+  // shape; the batch dimension (index 0) is filled in later by setInDims().
+  inDims_ = TensorShape({0, channels, height * depth, width});
+  outDims_ = TensorShape(4);
+
+  // Copy the axis groups that define the flattened output height and width.
+  const auto& reshapeConf = config_.reshape_conf();
+  const int numHeightAxes = reshapeConf.height_axis_size();
+  for (int idx = 0; idx != numHeightAxes; ++idx) {
+    heightAxis_.push_back(reshapeConf.height_axis(idx));
+  }
+  const int numWidthAxes = reshapeConf.width_axis_size();
+  for (int idx = 0; idx != numWidthAxes; ++idx) {
+    widthAxis_.push_back(reshapeConf.width_axis(idx));
+  }
+
+  createFunction(nchw2nhwc_, "NCHW2NHWC", FuncConfig());
+  createFunction(nhwc2nchw_, "NHWC2NCHW", FuncConfig());
+  return true;
+}
+// Derives the output shape from inDims_ (axes 0,1,2,3 of the input become
+// axes 0,3,1,2 of the output) and recomputes the flattened frame height and
+// width as products over the configured axis groups.
+void SwitchOrderLayer::setOutDims() {
+  // Input axis feeding each output axis: out[k] = in[kSourceAxis[k]].
+  static const size_t kSourceAxis[4] = {0, 2, 3, 1};
+  for (size_t outAxis = 0; outAxis < 4; ++outAxis) {
+    outDims_.setDim(outAxis, inDims_[kSourceAxis[outAxis]]);
+  }
+
+  reshapeHeight_ = 1;
+  for (int axis : heightAxis_) {
+    reshapeHeight_ *= outDims_[axis];
+  }
+  output_.setFrameHeight(reshapeHeight_);
+
+  reshapeWidth_ = 1;
+  for (int axis : widthAxis_) {
+    reshapeWidth_ *= outDims_[axis];
+  }
+  output_.setFrameWidth(reshapeWidth_);
+}
+// Refreshes inDims_ from the current input:
+//  - dim 0 is the batch size (number of rows of the input matrix);
+//  - dims 2 and 3 are overwritten by the input's frame height (times frame
+//    depth, where a depth of 0 counts as 1) and frame width, but only when
+//    the input reports non-zero values;
+//  - dim 1 (channels) is re-derived from the element count, and only
+//    overwritten when the derived value is non-zero.
+void SwitchOrderLayer::setInDims() {
+  MatrixPtr input = inputLayers_[0]->getOutputValue();
+  size_t batchSize = input->getHeight();
+  inDims_.setDim(0, batchSize);
+
+  int d = inputLayers_[0]->getOutput().getFrameDepth();
+  d = (d == 0 ? 1 : d);
+  int h = inputLayers_[0]->getOutput().getFrameHeight();
+  if (h != 0) inDims_.setDim(2, h * d);
+  int w = inputLayers_[0]->getOutput().getFrameWidth();
+  if (w != 0) inDims_.setDim(3, w);
+
+  // Guard the division: with an empty batch or unset geometry the divisor is
+  // zero, and dividing by it would be undefined behaviour. In that case the
+  // channel count taken from the configuration is kept. The arithmetic stays
+  // in size_t so large element counts are not truncated to int.
+  const size_t elementsPerChannel = inDims_[0] * inDims_[2] * inDims_[3];
+  if (elementsPerChannel != 0) {
+    const size_t channels = input->getElementCnt() / elementsPerChannel;
+    if (channels != 0) inDims_.setDim(1, channels);
+  }
+}
+// Forward pass: recomputes the input/output shapes for the current batch,
+// sizes the output buffers, converts the input from NCHW to NHWC and finally
+// applies the layer's activation.
+void SwitchOrderLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  setInDims();
+  setOutDims();
+  resetOutput(outDims_[0], outDims_[1] * outDims_[2] * outDims_[3]);
+  if (heightAxis_.size() > 0) {
+    // Present the output as a (reshapeHeight_ x reshapeWidth_) matrix when
+    // the configuration names explicit height axes.
+    getOutputValue()->reshape(reshapeHeight_, reshapeWidth_);
+    // The gradient buffer is not necessarily allocated (presumably absent in
+    // inference-only passes or when the layer needs no gradient - confirm
+    // against Layer::resetOutput), so only reshape it when it exists instead
+    // of dereferencing a null pointer.
+    const MatrixPtr& outputGrad = getOutputGrad();
+    if (outputGrad) {
+      outputGrad->reshape(reshapeHeight_, reshapeWidth_);
+    }
+  }
+  // switch NCHW to NHWC
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inDims_);
+  outputs.addArg(*getOutputValue(), outDims_);
+  nchw2nhwc_[0]->calc(inputs, outputs);
+  forwardActivation();
+}
+// Backward pass: back-propagates through the activation, then converts the
+// output gradient from NHWC back to NCHW and accumulates it (ADD_TO) into
+// the gradient of input 0. The update callback is not used by this layer.
+void SwitchOrderLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  backwardActivation();
+
+  // Source: gradient w.r.t. this layer's output, laid out as outDims_.
+  BufferArgs gradSource;
+  gradSource.addArg(*getOutputGrad(), outDims_);
+
+  // Destination: gradient of the first input, laid out as inDims_.
+  BufferArgs gradTarget;
+  gradTarget.addArg(*getInputGrad(0), inDims_, ADD_TO);
+
+  nhwc2nchw_[0]->calc(gradSource, gradTarget);
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/SwitchOrderLayer.h
+++ b/paddle/gserver/layers/SwitchOrderLayer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Layer.h"
+namespace paddle {
+/**
+ * \brief  This layer switches the memory layout of its input from NCHW to
+ *         NHWC in the forward pass and converts the output gradient from
+ *         NHWC back to NCHW in the backward pass. The output can optionally
+ *         be presented as a 2-D matrix whose height and width are products
+ *         of configured groups of output axes.
+ */
+class SwitchOrderLayer : public Layer {
+public:
+  explicit SwitchOrderLayer(const LayerConfig& config) : Layer(config) {}
+  ~SwitchOrderLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+  void setInDims();   // refresh inDims_ from the current input batch
+  void setOutDims();  // derive outDims_ and reshapeHeight_/reshapeWidth_
+protected:
+  std::vector<std::shared_ptr<FunctionBase>> nchw2nhwc_;  // forward conversion function
+  std::vector<std::shared_ptr<FunctionBase>> nhwc2nchw_;  // backward conversion function
+  TensorShape inDims_;   // input shape, 4-D: batch, channels, height, width
+  TensorShape outDims_;  // output shape, 4-D: batch, height, width, channels
+  std::vector<int> heightAxis_;  // output axes whose sizes multiply into reshapeHeight_
+  std::vector<int> widthAxis_;   // output axes whose sizes multiply into reshapeWidth_
+  size_t reshapeHeight_;  // frame height reported on the output
+  size_t reshapeWidth_;   // frame width reported on the output
+};
+}  // namespace paddle
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1703,6 +1703,55 @@ TEST(Layer, BatchNormalizationLayer) {
 #endif
 }
+// Runs the layer gradient check on a batch-normalisation layer of the given
+// `type` whose input is a 3-D image (img_size, img_size_y and img_size_z are
+// all set on the image configuration).
+void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) {
+  const int kChannels = 10;
+  const int kImgSize = 16;
+  const int kImgSizeY = 8;
+  const int kImgSizeZ = 8;
+  const size_t inputSize = kChannels * kImgSize * kImgSizeY * kImgSizeZ;
+
+  TestConfig config;
+  config.biasSize = kChannels;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_size(inputSize);
+  config.layerConfig.set_active_type("sigmoid");
+
+  // Input 0 is the data; inputs 1 and 2 are the static running statistics.
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", inputSize, kChannels});
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_1_running_mean", 1, kChannels});
+  config.inputDefs.back().isStatic = true;
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_2_running_var", 1, kChannels});
+  config.inputDefs.back().isStatic = true;
+
+  // Only the first of the three layer inputs carries an image configuration.
+  LayerInputConfig* dataInput = config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  ImageConfig* imageConf = dataInput->mutable_image_conf();
+  imageConf->set_channels(kChannels);
+  imageConf->set_img_size(kImgSize);
+  imageConf->set_img_size_y(kImgSizeY);
+  imageConf->set_img_size_z(kImgSizeZ);
+
+  testLayerGrad(config, "batch_norm", 64, trans, useGpu, /* useWeight */ true);
+}
+// Exercises the 3-D batch-norm gradient check on CPU and, when GPU support is
+// compiled in, on GPU; the cuDNN variant only runs with cuDNN >= 4000.
+TEST(Layer, testBatchNorm3DLayer) {
+  const bool kNoTrans = false;
+  testBatchNorm3DLayer("batch_norm", kNoTrans, /* useGpu= */ false);
+#ifndef PADDLE_ONLY_CPU
+  testBatchNorm3DLayer("batch_norm", kNoTrans, /* useGpu= */ true);
+  const bool cudnnIsRecentEnough = hl_get_cudnn_lib_version() >= 4000;
+  if (cudnnIsRecentEnough) {
+    testBatchNorm3DLayer("cudnn_batch_norm", kNoTrans, /* useGpu= */ true);
+  }
+#endif
+}
 void testConvOperator(bool isDeconv) {
  TestConfig config;
  const int NUM_FILTERS = 16;
@@ -2008,6 +2057,31 @@ TEST(Layer, CropLayer) {
  }
 }
+TEST(Layer, SwitchOrderLayer) {
+  TestConfig config;
+  // config input_0: one data input with 1024 values per sample
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});  // 1024 = 4 channels * 16 * 16
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ImageConfig* img = input->mutable_image_conf();
+  img->set_channels(4);
+  img->set_img_size(16);
+  img->set_img_size_y(16);
+  ReshapeConfig* reshape = config.layerConfig.mutable_reshape_conf();
+  reshape->add_height_axis(0);  // axes index the switched (output) shape
+  reshape->add_height_axis(1);
+  reshape->add_height_axis(2);
+  reshape->add_width_axis(3);  // output frame width = size of output axis 3
+  // config the switch_order layer under test
+  config.layerConfig.set_type("switch_order");
+  config.layerConfig.set_name("switchOrderLayer");
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "switch_order", 100, false, useGpu, true);  // third argument is presumably the batch size - confirm
+  }
+}
 vector<real> randSampling(real range, int n) {
  CHECK_GE(range, n);
  vector<real> num(range);

--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -84,6 +84,7 @@ LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP)
 namespace paddle {
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <>
 void gemm<float>(const CBLAS_TRANSPOSE transA,
                 const CBLAS_TRANSPOSE transB,
@@ -143,6 +144,7 @@ void gemm<double>(const CBLAS_TRANSPOSE transA,
              C,
              ldc);
 }
+#endif
 template <>
 int getrf<float>(const CBLAS_ORDER order,
@@ -182,6 +184,7 @@ int getri<double>(const CBLAS_ORDER order,
  return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv);
 }
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <>
 void axpy<float>(const int n, const float alpha, const float* x, float* y) {
  cblas_saxpy(n, alpha, x, 1, y, 1);
@@ -201,6 +204,7 @@ template <>
 double dotProduct<double>(const int n, const double* x, const double* y) {
  return cblas_ddot(n, x, 1, y, 1);
 }
+#endif
 #if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML)

--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -40,7 +40,14 @@ extern "C" {
 #ifndef LAPACK_FOUND
 extern "C" {
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 #include <cblas.h>
+#else
+typedef enum CBLAS_ORDER {
+  CblasRowMajor = 101,
+  CblasColMajor = 102
+} CBLAS_ORDER;
+#endif
 int LAPACKE_sgetrf(
    int matrix_layout, int m, int n, float* a, int lda, int* ipiv);
 int LAPACKE_dgetrf(
@@ -56,6 +63,7 @@ int LAPACKE_dgetri(
 namespace paddle {
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <class T>
 void gemm(const CBLAS_TRANSPOSE transA,
          const CBLAS_TRANSPOSE transB,
@@ -70,6 +78,7 @@ void gemm(const CBLAS_TRANSPOSE transA,
          const T beta,
          T* C,
          const int ldc);
+#endif
 template <class T>
 int getrf(const CBLAS_ORDER Order,
@@ -84,10 +93,21 @@ int getri(
    const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv);
 template <class T>
-void axpy(const int n, const T alpha, const T* x, T* y);
+void axpy(const int n, const T alpha, const T* x, T* y) {
+  /// y = y + alpha * x
+  for (int i = 0; i < n; i++) {
+    y[i] = y[i] + alpha * x[i];
+  }
+}
 template <class T>
-T dotProduct(const int n, const T* x, const T* y);
+T dotProduct(const int n, const T* x, const T* y) {
+  T result = static_cast<T>(0);
+  for (int i = 0; i < n; i++) {
+    result += x[i] * y[i];
+  }
+  return result;
+}
 template <class T>
 void vExp(const int n, const T* a, T* r);

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "hl_top_k.h"
 #include "paddle/utils/Logging.h"
+#include "paddle/function/GemmFunctor.h"
 #include "paddle/utils/ThreadLocal.h"
 #include "SIMDFunctions.h"
@@ -2773,24 +2774,24 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
  CHECK(!isTransposed()) << "Not supported";
  size_t a_col, b_col, a_row, b_row;
-  CBLAS_TRANSPOSE a_trans, b_trans;
+  bool a_trans, b_trans;
  if (!a->isTransposed()) {
    a_col = a->getWidth();
    a_row = a->getHeight();
-    a_trans = CblasNoTrans;
+    a_trans = false;
  } else {
    a_col = a->getHeight();
    a_row = a->getWidth();
-    a_trans = CblasTrans;
+    a_trans = true;
  }
  if (!b->isTransposed()) {
    b_col = b->getWidth();
    b_row = b->getHeight();
-    b_trans = CblasNoTrans;
+    b_trans = false;
  } else {
    b_col = b->getHeight();
    b_row = b->getWidth();
-    b_trans = CblasTrans;
+    b_trans = true;
  }
  CHECK_EQ(a_col, b_row);
@@ -2807,7 +2808,7 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
  int lda = a->getStride();
  int ldb = b->getStride();
  int ldc = getStride();
-  gemm<real>(
+  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
      a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
 }

--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -1616,6 +1616,10 @@ public:
 };
 class CpuMatrix : public Matrix {
+private:
+  MatrixPtr sftmaxSum_;
+  MatrixPtr sftmaxDot_;
 public:
  CpuMatrix(size_t height, size_t width, bool trans = false);
  CpuMatrix(real* data, size_t height, size_t width, bool trans = false)

--- a/paddle/operators/identity_op.cc
+++ b/paddle/operators/identity_op.cc
@@ -18,17 +18,20 @@
 namespace paddle {
 namespace operators {
-// identity is a alias of scale op. This is also a example for creating a alias
+// The identity operator is an alias of the scale operator. This is also an
-// operator.
+// example for creating an alias for an existing operator.
 template <typename AttrType>
 class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  IdentityOpMaker(framework::OpProto *proto,
                  framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "input tensor of identity op");
+    AddInput("X", "The input tensor of identity operator.");
-    AddOutput("Out", "output tensor of identity op");
+    AddOutput("Out", "The output tensor of identity operator.");
-    AddComment("identity operator. Just a alias of scale op which scale = 1.0");
+    AddComment(R"DOC(
+The identity operator is an alias of the scale operator
+with the attribute scale fixed to 1.0.
+)DOC");
  }
 };

--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
 if(WITH_GPU)
-    nv_library(math_function SRCS math_function.cc math_function.cu DEPS cblas device_context)
+    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc 
+    im2col.cu DEPS cblas device_context)
 else()
-    cc_library(math_function SRCS math_function.cc DEPS cblas device_context)
+    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context)
 endif()
 nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
+cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
--- a/paddle/operators/math/im2col.cc
+++ b/paddle/operators/math/im2col.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/math/im2col.h"
+namespace paddle {
+namespace operators {
+namespace math {
+/*
+ * CPU im2col for the kCFO column layout: every element of col is filled with
+ * the image pixel it corresponds to, or with zero when that pixel lies in
+ * the padding area.
+ *
+ * im = [input_channels, input_height, input_width]
+ * col =
+ *   [input_channels, filter_height, filter_width, output_height, output_width]
+ *
+ * The filter and output sizes are taken from the shape of col; the context
+ * argument is not used by this implementation.
+ */
+template <class T>
+class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
+                    platform::CPUPlace, T> {
+ public:
+  void operator()(const framework::Tensor& im, framework::Tensor& col,
+                  int stride_height, int stride_width, int padding_height,
+                  int padding_width, platform::DeviceContext* context) {
+    PADDLE_ENFORCE(im.dims().size() == 3);
+    PADDLE_ENFORCE(col.dims().size() == 5);
+    int input_channels = im.dims()[0];
+    int input_height = im.dims()[1];
+    int input_width = im.dims()[2];
+    int filter_height = col.dims()[1];
+    int filter_width = col.dims()[2];
+    int output_height = col.dims()[3];
+    int output_width = col.dims()[4];
+    int channels_col = input_channels * filter_height * filter_width;  // leading three col axes flattened
+    const T* im_data = im.data<T>();
+    T* col_data = col.data<T>();
+    for (int c = 0; c < channels_col; ++c) {
+      int w_offset = c % filter_width;  // column inside the filter window
+      int h_offset = (c / filter_width) % filter_height;  // row inside the filter window
+      int c_im = c / filter_width / filter_height;  // source image channel
+      for (int h = 0; h < output_height; ++h) {
+        for (int w = 0; w < output_width; ++w) {
+          int im_row_idx = h * stride_height + h_offset;  // row in padded coordinates
+          int im_col_idx = w * stride_width + w_offset;  // column in padded coordinates
+          if ((im_row_idx - padding_height) < 0 ||
+              (im_row_idx - padding_height) >= input_height ||
+              (im_col_idx - padding_width) < 0 ||
+              (im_col_idx - padding_width) >= input_width) {
+            col_data[(c * output_height + h) * output_width + w] = T(0);  // padding area
+          } else {
+            im_row_idx += c_im * input_height - padding_height;  // now a row index over all channels
+            im_col_idx -= padding_width;  // now an unpadded column index
+            col_data[(c * output_height + h) * output_width + w] =
+                im_data[im_row_idx * input_width + im_col_idx];
+          }
+        }
+      }
+    }
+  }
+};
+/*
+ * CPU col2im for the kCFO column layout: every element of col that maps to
+ * a pixel inside the image is added to that pixel; elements that map to the
+ * padding area are skipped. Values are accumulated with +=, so im is not
+ * cleared here.
+ *
+ * im = [input_channels, input_height, input_width]
+ * col =
+ *   [input_channels, filter_height, filter_width, output_height, output_width]
+ *
+ * The filter and output sizes are taken from the shape of col; the context
+ * argument is not used by this implementation.
+ */
+template <class T>
+class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
+                    platform::CPUPlace, T> {
+ public:
+  void operator()(framework::Tensor& im, const framework::Tensor& col,
+                  int stride_height, int stride_width, int padding_height,
+                  int padding_width, platform::DeviceContext* context) {
+    PADDLE_ENFORCE(im.dims().size() == 3);
+    PADDLE_ENFORCE(col.dims().size() == 5);
+    int input_channels = im.dims()[0];
+    int input_height = im.dims()[1];
+    int input_width = im.dims()[2];
+    int filter_height = col.dims()[1];
+    int filter_width = col.dims()[2];
+    int output_height = col.dims()[3];
+    int output_width = col.dims()[4];
+    int channels_col = input_channels * filter_height * filter_width;  // leading three col axes flattened
+    T* im_data = im.data<T>();
+    const T* col_data = col.data<T>();
+    for (int c = 0; c < channels_col; ++c) {
+      int w_offset = c % filter_width;  // column inside the filter window
+      int h_offset = (c / filter_width) % filter_height;  // row inside the filter window
+      int c_im = c / filter_width / filter_height;  // destination image channel
+      for (int h = 0; h < output_height; ++h) {
+        for (int w = 0; w < output_width; ++w) {
+          int im_row_idx = h * stride_height + h_offset;  // row in padded coordinates
+          int im_col_idx = w * stride_width + w_offset;  // column in padded coordinates
+          if ((im_row_idx - padding_height) >= 0 &&
+              (im_row_idx - padding_height) < input_height &&
+              (im_col_idx - padding_width) >= 0 &&
+              (im_col_idx - padding_width) < input_width) {
+            im_row_idx += c_im * input_height - padding_height;  // now a row index over all channels
+            im_col_idx -= padding_width;  // now an unpadded column index
+            im_data[im_row_idx * input_width + im_col_idx] +=
+                col_data[(c * output_height + h) * output_width + w];
+          }
+        }
+      }
+    }
+  }
+};
+template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
+                             platform::CPUPlace, float>;
+template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
+                             platform::CPUPlace, double>;
+template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
+                             platform::CPUPlace, float>;
+template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
+                             platform::CPUPlace, double>;
+/*
+ * CPU im2col for the kOCF column layout. Each col element receives the image
+ * pixel it corresponds to, or zero when that pixel lies in the padding area.
+ *
+ * im = [input_channels, input_height, input_width]
+ * col =
+ *   [output_height, output_width, input_channels, filter_height, filter_width]
+ *
+ * Filter and output sizes are read from the shape of col; the context
+ * argument is not used.
+ */
+template <class T>
+class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
+                    platform::CPUPlace, T> {
+ public:
+  void operator()(const framework::Tensor& im, framework::Tensor& col,
+                  int stride_height, int stride_width, int padding_height,
+                  int padding_width, platform::DeviceContext* context) {
+    PADDLE_ENFORCE(im.dims().size() == 3);
+    PADDLE_ENFORCE(col.dims().size() == 5);
+    const int channels = im.dims()[0];
+    const int im_height = im.dims()[1];
+    const int im_width = im.dims()[2];
+    const int out_height = col.dims()[0];
+    const int out_width = col.dims()[1];
+    const int ker_height = col.dims()[3];
+    const int ker_width = col.dims()[4];
+
+    const T* src = im.data<T>();
+    T* dst = col.data<T>();
+
+    for (int oh = 0; oh < out_height; ++oh) {
+      for (int ow = 0; ow < out_width; ++ow) {
+        // Offset of this output position, counted in channel slots.
+        const int patch_base = (oh * out_width + ow) * channels;
+        for (int ch = 0; ch < channels; ++ch) {
+          const int channel_base = (patch_base + ch) * ker_height;
+          for (int kh = 0; kh < ker_height; ++kh) {
+            const int row_base = (channel_base + kh) * ker_width;
+            // Image row addressed by this filter row (may be in the padding).
+            const int im_row = oh * stride_height + kh - padding_height;
+            const bool row_inside = im_row >= 0 && im_row < im_height;
+            for (int kw = 0; kw < ker_width; ++kw) {
+              const int im_col = ow * stride_width + kw - padding_width;
+              const bool inside =
+                  row_inside && im_col >= 0 && im_col < im_width;
+              dst[row_base + kw] =
+                  inside ? src[(ch * im_height + im_row) * im_width + im_col]
+                         : T(0);
+            }
+          }
+        }
+      }
+    }
+  }
+};
+/*
+ * CPU col2im for the kOCF column layout. Each col element that maps to a
+ * pixel inside the image is added to that pixel; elements that map to the
+ * padding area are ignored. Values are accumulated, im is not cleared here.
+ *
+ * im = [input_channels, input_height, input_width]
+ * col =
+ *   [output_height, output_width, input_channels, filter_height, filter_width]
+ *
+ * Filter and output sizes are read from the shape of col; the context
+ * argument is not used.
+ */
+template <class T>
+class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
+                    platform::CPUPlace, T> {
+ public:
+  void operator()(framework::Tensor& im, const framework::Tensor& col,
+                  int stride_height, int stride_width, int padding_height,
+                  int padding_width, platform::DeviceContext* context) {
+    PADDLE_ENFORCE(im.dims().size() == 3);
+    PADDLE_ENFORCE(col.dims().size() == 5);
+    const int channels = im.dims()[0];
+    const int im_height = im.dims()[1];
+    const int im_width = im.dims()[2];
+    const int out_height = col.dims()[0];
+    const int out_width = col.dims()[1];
+    const int ker_height = col.dims()[3];
+    const int ker_width = col.dims()[4];
+
+    T* dst = im.data<T>();
+    const T* src = col.data<T>();
+
+    for (int oh = 0; oh < out_height; ++oh) {
+      for (int ow = 0; ow < out_width; ++ow) {
+        // Offset of this output position, counted in channel slots.
+        const int patch_base = (oh * out_width + ow) * channels;
+        for (int ch = 0; ch < channels; ++ch) {
+          const int channel_base = (patch_base + ch) * ker_height;
+          for (int kh = 0; kh < ker_height; ++kh) {
+            const int im_row = oh * stride_height + kh - padding_height;
+            if (im_row < 0 || im_row >= im_height) {
+              continue;  // the whole filter row falls into the padding
+            }
+            const int row_base = (channel_base + kh) * ker_width;
+            for (int kw = 0; kw < ker_width; ++kw) {
+              const int im_col = ow * stride_width + kw - padding_width;
+              if (im_col < 0 || im_col >= im_width) {
+                continue;  // padding column
+              }
+              dst[(ch * im_height + im_row) * im_width + im_col] +=
+                  src[row_base + kw];
+            }
+          }
+        }
+      }
+    }
+  }
+};
+template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
+                             platform::CPUPlace, float>;
+template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
+                             platform::CPUPlace, double>;
+template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
+                             platform::CPUPlace, float>;
+template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
+                             platform::CPUPlace, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/im2col.cu
+++ b/paddle/operators/math/im2col.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/math/im2col.h"
+#include "paddle/platform/cuda_helper.h"
+namespace paddle {
+namespace operators {
+namespace math {
+template <class T>  // im2col kernel: each thread with index < num_outs fills one filter window of data_col
+__global__ void im2col(const T* data_im, int num_outs, int height, int width,
+                       int filter_height, int filter_width, int stride_height,
+                       int stride_width, int padding_height, int padding_width,
+                       int output_height, int output_width, T* data_col) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;  // flat thread id over the 2-D grid
+  if (index < num_outs) {
+    int w_out = index % output_width;  // output column handled by this thread
+    index /= output_width;  // index now holds channel_in * output_height + h_out
+    int h_out = index % output_height;  // output row handled by this thread
+    int channel_in = index / output_height;  // image channel handled by this thread
+    int channel_out = channel_in * filter_height * filter_width;  // first col channel belonging to channel_in
+    int h_in = h_out * stride_height;  // top row of the window, in padded coordinates
+    int w_in = w_out * stride_width;  // left column of the window, in padded coordinates
+    data_col += (channel_out * output_height + h_out) * output_width + w_out;  // first element written by this thread
+    for (int i = 0; i < filter_height; ++i) {
+      for (int j = 0; j < filter_width; ++j) {
+        int rIdx = int(h_in + i);
+        int cIdx = int(w_in + j);
+        if ((rIdx - (int)padding_height) >= (int)height ||
+            (rIdx - (int)padding_height) < 0 ||
+            (cIdx - (int)padding_width) >= (int)width ||
+            (cIdx - (int)padding_width) < 0) {
+          *data_col = 0;  // position lies in the padding area
+        } else {
+          rIdx = rIdx + channel_in * height - padding_height;  // row index over all channels of data_im
+          cIdx = cIdx - padding_width;  // unpadded column index
+          *data_col = data_im[rIdx * width + cIdx];
+        }
+        data_col += output_height * output_width;  // advance to the next filter element (next col channel)
+      }
+    }
+  }
+}
+/*
+ * GPU im2col for the kCFO column layout; launches the im2col kernel on the
+ * stream of the CUDA device context passed in.
+ *
+ * im = [input_channels, input_height, input_width]
+ * col =
+ *   [input_channels, filter_height, filter_width, output_height, output_width]
+ */
+template <class T>
+class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
+                    platform::GPUPlace, T> {
+ public:
+  void operator()(const framework::Tensor& im, framework::Tensor& col,
+                  int stride_height, int stride_width, int padding_height,
+                  int padding_width, platform::DeviceContext* context) {
+    PADDLE_ENFORCE(im.dims().size() == 3);
+    PADDLE_ENFORCE(col.dims().size() == 5);
+
+    const int channels = im.dims()[0];
+    const int im_height = im.dims()[1];
+    const int im_width = im.dims()[2];
+    const int ker_height = col.dims()[1];
+    const int ker_width = col.dims()[2];
+    const int out_height = col.dims()[3];
+    const int out_width = col.dims()[4];
+
+    // One thread per (channel, output row, output column) triple.
+    const int num_outputs = channels * out_height * out_width;
+
+    // Launch geometry: 1024 threads per block; the blocks are arranged in a
+    // grid that is 512 wide and as tall as needed to cover num_outputs.
+    const int kThreadsPerBlock = 1024;
+    const int kGridWidth = 512;
+    const int num_blocks =
+        (num_outputs + kThreadsPerBlock - 1) / kThreadsPerBlock;
+    const int grid_height = (num_blocks + kGridWidth - 1) / kGridWidth;
+    dim3 threads(kThreadsPerBlock, 1);
+    dim3 grid(kGridWidth, grid_height);
+
+    auto* cuda_context =
+        reinterpret_cast<platform::CUDADeviceContext*>(context);
+    im2col<T><<<grid, threads, 0, cuda_context->stream()>>>(
+        im.data<T>(), num_outputs, im_height, im_width, ker_height, ker_width,
+        stride_height, stride_width, padding_height, padding_width,
+        out_height, out_width, col.data<T>());
+  }
+};
+// CUDA kernel for col2im in the CFO layout. `height` and `width` are the
+// extents of the padded image; each thread with a flat id below n owns one
+// element of that padded image. Threads that land inside the unpadded region
+// sum every column entry that was copied from their pixel and add the sum
+// to data_im.
+template <class T>
+__global__ void col2im(size_t n, const T* data_col, size_t height, size_t width,
+                       size_t channels, size_t filter_height,
+                       size_t filter_width, size_t stride_height,
+                       size_t stride_width, size_t padding_height,
+                       size_t padding_width, size_t output_height,
+                       size_t output_width, T* data_im) {
+  const size_t tid =
+      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (tid >= n) return;
+  // Position of this thread inside the padded image.
+  int im_x = int(tid % width);
+  int im_y = int((tid / width) % height);
+  const int chan = int(tid / (width * height));
+  // Extents of the unpadded image.
+  const size_t im_width = width - 2 * padding_width;
+  const size_t im_height = height - 2 * padding_height;
+  const bool x_inside = (im_x - (int)padding_width) >= 0 &&
+                        (im_x - (int)padding_width) < im_width;
+  const bool y_inside = (im_y - (int)padding_height) >= 0 &&
+                        (im_y - padding_height) < im_height;
+  // Elements in the padding border have no counterpart in data_im.
+  if (!(x_inside && y_inside)) return;
+  // Range of output columns/rows whose sliding window covers this pixel.
+  const int w_begin = (im_x < (int)filter_width)
+                          ? 0
+                          : (im_x - int(filter_width)) / (int)stride_width + 1;
+  const int w_end =
+      min((int)(im_x / (int)stride_width + 1), (int)(output_width));
+  const int h_begin = (im_y < (int)filter_height)
+                          ? 0
+                          : (im_y - (int)filter_height) / (int)stride_height + 1;
+  const int h_end = min(int(im_y / stride_height + 1), int(output_height));
+  T sum = 0;
+  for (int h_col = h_begin; h_col < h_end; ++h_col) {
+    for (int w_col = w_begin; w_col < w_end; ++w_col) {
+      // Row of the column buffer: channel, then the offset of this pixel
+      // inside the window anchored at (h_col, w_col).
+      int c_col = int(chan * filter_height * filter_width) +
+                  (im_y - h_col * (int)stride_height) * (int)filter_width +
+                  (im_x - w_col * (int)stride_width);
+      sum += data_col[(c_col * output_height + h_col) * output_width + w_col];
+    }
+  }
+  // Translate from padded to unpadded coordinates before writing.
+  im_y -= padding_height;
+  im_x -= padding_width;
+  data_im[chan * (im_width * im_height) + im_y * im_width + im_x] += sum;
+}
+/*
+ * GPU col2im for the kCFO column layout.
+ *
+ * im = [input_channels, input_height, input_width]
+ * col =
+ *   [input_channels, filter_height, filter_width, output_height, output_width]
+ *
+ * The filter and output sizes are read from the shape of col. The kernel is
+ * launched over the padded image extent and adds its result onto the
+ * existing contents of im. It is enqueued on the CUDA stream owned by
+ * context, which therefore has to point to a platform::CUDADeviceContext.
+ */
+template <class T>
+class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
+                    platform::GPUPlace, T> {
+ public:
+  void operator()(framework::Tensor& im, const framework::Tensor& col,
+                  int stride_height, int stride_width, int padding_height,
+                  int padding_width, platform::DeviceContext* context) {
+    PADDLE_ENFORCE(im.dims().size() == 3);
+    PADDLE_ENFORCE(col.dims().size() == 5);
+    int input_channels = im.dims()[0];
+    int input_height = im.dims()[1];
+    int input_width = im.dims()[2];
+    int filter_height = col.dims()[1];
+    int filter_width = col.dims()[2];
+    int output_height = col.dims()[3];
+    int output_width = col.dims()[4];
+    // One thread per element of the padded image.
+    size_t num_kernels = input_channels * (input_height + 2 * padding_height) *
+                         (input_width + 2 * padding_width);
+    // 1024 threads per block; the blocks are arranged as a 512 x block_y grid
+    // and flattened back into a linear index inside the kernel.
+    size_t blocks = (num_kernels + 1024 - 1) / 1024;
+    size_t block_x = 512;
+    size_t block_y = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(block_x, block_y);
+    // context is typed as the base class; static_cast is the well-defined way
+    // to downcast within a class hierarchy (reinterpret_cast is not).
+    auto* cuda_context = static_cast<platform::CUDADeviceContext*>(context);
+    // To avoid atomic operations, each thread owns exactly one image element
+    // and sums all column entries that map onto it.
+    col2im<T><<<grid, threads, 0, cuda_context->stream()>>>(
+        num_kernels, col.data<T>(), input_height + 2 * padding_height,
+        input_width + 2 * padding_width, input_channels, filter_height,
+        filter_width, stride_height, stride_width, padding_height,
+        padding_width, output_height, output_width, im.data<T>());
+  }
+};
+// Explicit instantiations of the CFO-layout GPU functors for float and double.
+template class Im2ColFunctor<ColFormat::kCFO, platform::GPUPlace, float>;
+template class Im2ColFunctor<ColFormat::kCFO, platform::GPUPlace, double>;
+template class Col2ImFunctor<ColFormat::kCFO, platform::GPUPlace, float>;
+template class Col2ImFunctor<ColFormat::kCFO, platform::GPUPlace, double>;
+// CUDA kernel for im2col in the OCF layout
+// ([output_height, output_width, input_channels, filter_height, filter_width]).
+// Block (x, y) handles output position (column x, row y); its threads stride
+// over the channels and the filter window and copy the matching image
+// elements, writing T(0) where the window reaches into the padding.
+template <class T>
+__global__ void im2colOCF(const T* im_data, T* col_data, int input_channels,
+                          int input_height, int input_width, int filter_height,
+                          int filter_width, int stride_height, int stride_width,
+                          int padding_height, int padding_width,
+                          int output_height, int output_width) {
+  const int out_x = blockIdx.x;
+  const int out_y = blockIdx.y;
+  // Number of column entries per output position, and where the entries of
+  // this block's position start.
+  const int patch_size = input_channels * filter_height * filter_width;
+  const int patch_base = (out_y * output_width + out_x) * patch_size;
+  for (int c = threadIdx.z; c < input_channels; c += blockDim.z) {
+    for (int fy = threadIdx.y; fy < filter_height; fy += blockDim.y) {
+      const int im_y = fy + out_y * stride_height - padding_height;
+      for (int fx = threadIdx.x; fx < filter_width; fx += blockDim.x) {
+        const int im_x = fx + out_x * stride_width - padding_width;
+        const int col_offset =
+            patch_base + (c * filter_height + fy) * filter_width + fx;
+        const bool inside = im_y >= 0 && im_y < input_height && im_x >= 0 &&
+                            im_x < input_width;
+        if (inside) {
+          col_data[col_offset] =
+              im_data[(c * input_height + im_y) * input_width + im_x];
+        } else {
+          col_data[col_offset] = T(0);
+        }
+      }
+    }
+  }
+}
+/*
+ * GPU im2col for the kOCF column layout.
+ *
+ * im = [input_channels, input_height, input_width]
+ * col =
+ *   [output_height, output_width, input_channels, filter_height, filter_width]
+ *
+ * The filter and output sizes are read from the shape of col. One block is
+ * launched per output position; the block's x/y extents are sized to the
+ * filter and its z extent covers channels. The kernel is enqueued on the
+ * CUDA stream owned by context, which therefore has to point to a
+ * platform::CUDADeviceContext.
+ */
+template <class T>
+class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
+                    platform::GPUPlace, T> {
+ public:
+  void operator()(const framework::Tensor& im, framework::Tensor& col,
+                  int stride_height, int stride_width, int padding_height,
+                  int padding_width, platform::DeviceContext* context) {
+    PADDLE_ENFORCE(im.dims().size() == 3);
+    PADDLE_ENFORCE(col.dims().size() == 5);
+    int input_channels = im.dims()[0];
+    int input_height = im.dims()[1];
+    int input_width = im.dims()[2];
+    int filter_height = col.dims()[3];
+    int filter_width = col.dims()[4];
+    int output_height = col.dims()[0];
+    int output_width = col.dims()[1];
+    // Smallest square tile out of 4, 8, 16 that covers the filter; anything
+    // larger falls back to 32 and the kernel strides over the remainder.
+    int block_dim_x = 0;
+    int block_dim_y = 0;
+    if (filter_height <= 4 && filter_width <= 4) {
+      block_dim_x = 4;
+      block_dim_y = 4;
+    } else if (filter_height <= 8 && filter_width <= 8) {
+      block_dim_x = 8;
+      block_dim_y = 8;
+    } else if (filter_height <= 16 && filter_width <= 16) {
+      block_dim_x = 16;
+      block_dim_y = 16;
+    } else {
+      block_dim_x = 32;
+      block_dim_y = 32;
+    }
+    // Spend whatever is left of a 1024-thread budget on channels.
+    int block_dim_z = 1024 / block_dim_x / block_dim_y;
+    dim3 threads(block_dim_x, block_dim_y,
+                 std::min(block_dim_z, input_channels));
+    dim3 grid(output_width, output_height);
+    // context is typed as the base class; static_cast is the well-defined way
+    // to downcast within a class hierarchy (reinterpret_cast is not).
+    auto* cuda_context = static_cast<platform::CUDADeviceContext*>(context);
+    im2colOCF<T><<<grid, threads, 0, cuda_context->stream()>>>(
+        im.data<T>(), col.data<T>(), input_channels, input_height, input_width,
+        filter_height, filter_width, stride_height, stride_width,
+        padding_height, padding_width, output_height, output_width);
+  }
+};
+// CUDA kernel for col2im in the OCF layout
+// ([output_height, output_width, input_channels, filter_height, filter_width]).
+// Block (x, y) handles output position (column x, row y); its threads stride
+// over the channels and the filter window and atomically add each column
+// entry onto the image element it maps to. Entries that map into the padding
+// are dropped.
+template <class T>
+__global__ void col2imOCF(T* im_data, const T* col_data, int input_channels,
+                          int input_height, int input_width, int filter_height,
+                          int filter_width, int stride_height, int stride_width,
+                          int padding_height, int padding_width,
+                          int output_height, int output_width) {
+  const int out_x = blockIdx.x;
+  const int out_y = blockIdx.y;
+  // Number of column entries per output position, and where the entries of
+  // this block's position start.
+  const int patch_size = input_channels * filter_height * filter_width;
+  const int patch_base = (out_y * output_width + out_x) * patch_size;
+  for (int c = threadIdx.z; c < input_channels; c += blockDim.z) {
+    for (int fy = threadIdx.y; fy < filter_height; fy += blockDim.y) {
+      const int im_y = fy + out_y * stride_height - padding_height;
+      for (int fx = threadIdx.x; fx < filter_width; fx += blockDim.x) {
+        const int im_x = fx + out_x * stride_width - padding_width;
+        const bool inside = im_y >= 0 && im_y < input_height && im_x >= 0 &&
+                            im_x < input_width;
+        if (!inside) continue;
+        const int im_offset = (c * input_height + im_y) * input_width + im_x;
+        const int col_offset =
+            patch_base + (c * filter_height + fy) * filter_width + fx;
+        // Windows of different blocks overlap, hence the atomic update.
+        paddle::platform::CudaAtomicAdd(im_data + im_offset,
+                                        col_data[col_offset]);
+      }
+    }
+  }
+}
+/*
+ * GPU col2im for the kOCF column layout.
+ *
+ * im = [input_channels, input_height, input_width]
+ * col =
+ *   [output_height, output_width, input_channels, filter_height, filter_width]
+ *
+ * The filter and output sizes are read from the shape of col. One block is
+ * launched per output position and the kernel atomically adds the column
+ * entries onto the existing contents of im. The kernel is enqueued on the
+ * CUDA stream owned by context, which therefore has to point to a
+ * platform::CUDADeviceContext.
+ */
+template <class T>
+class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
+                    platform::GPUPlace, T> {
+ public:
+  void operator()(framework::Tensor& im, const framework::Tensor& col,
+                  int stride_height, int stride_width, int padding_height,
+                  int padding_width, platform::DeviceContext* context) {
+    PADDLE_ENFORCE(im.dims().size() == 3);
+    PADDLE_ENFORCE(col.dims().size() == 5);
+    int input_channels = im.dims()[0];
+    int input_height = im.dims()[1];
+    int input_width = im.dims()[2];
+    int filter_height = col.dims()[3];
+    int filter_width = col.dims()[4];
+    int output_height = col.dims()[0];
+    int output_width = col.dims()[1];
+    // Smallest square tile out of 4, 8, 16 that covers the filter; anything
+    // larger falls back to 32 and the kernel strides over the remainder.
+    int block_dim_x = 0;
+    int block_dim_y = 0;
+    if (filter_height <= 4 && filter_width <= 4) {
+      block_dim_x = 4;
+      block_dim_y = 4;
+    } else if (filter_height <= 8 && filter_width <= 8) {
+      block_dim_x = 8;
+      block_dim_y = 8;
+    } else if (filter_height <= 16 && filter_width <= 16) {
+      block_dim_x = 16;
+      block_dim_y = 16;
+    } else {
+      block_dim_x = 32;
+      block_dim_y = 32;
+    }
+    // Spend whatever is left of a 1024-thread budget on channels.
+    int block_dim_z = 1024 / block_dim_x / block_dim_y;
+    dim3 threads(block_dim_x, block_dim_y,
+                 std::min(block_dim_z, input_channels));
+    dim3 grid(output_width, output_height);
+    // context is typed as the base class; static_cast is the well-defined way
+    // to downcast within a class hierarchy (reinterpret_cast is not).
+    auto* cuda_context = static_cast<platform::CUDADeviceContext*>(context);
+    col2imOCF<T><<<grid, threads, 0, cuda_context->stream()>>>(
+        im.data<T>(), col.data<T>(), input_channels, input_height, input_width,
+        filter_height, filter_width, stride_height, stride_width,
+        padding_height, padding_width, output_height, output_width);
+  }
+};
+// Explicit instantiations of the OCF-layout GPU functors for float and double.
+template class Im2ColFunctor<ColFormat::kOCF, platform::GPUPlace, float>;
+template class Im2ColFunctor<ColFormat::kOCF, platform::GPUPlace, double>;
+template class Col2ImFunctor<ColFormat::kOCF, platform::GPUPlace, float>;
+template class Col2ImFunctor<ColFormat::kOCF, platform::GPUPlace, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/im2col.h
+++ b/paddle/operators/math/im2col.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+namespace paddle {
+namespace operators {
+namespace math {
+/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor.
+ *
+ *   kCFO: [input_channels, filter_height, filter_width,
+ *          output_height, output_width]
+ *   kOCF: [output_height, output_width,
+ *          input_channels, filter_height, filter_width]
+ */
+enum class ColFormat { kCFO = 0, kOCF = 1 };
+/*
+ * \brief Converts the image data of three dimensions (CHW) into column data
+ *        of five dimensions in the Im2ColFunctor calculation.
+ *        In the Col2ImFunctor calculation the conversion is reversed.
+ *
+ * \param im              Image tensor of shape
+ *                        [input_channels, input_height, input_width].
+ * \param col             Column tensor of five dimensions; the order of the
+ *                        dimensions depends on the template argument Format
+ *                        (see below).
+ * \param stride_height   Stride of the filter window along the height.
+ * \param stride_width    Stride of the filter window along the width.
+ * \param padding_height  Padding applied to the image along the height.
+ * \param padding_width   Padding applied to the image along the width.
+ * \param context         Device context handed to the implementation.
+ *
+ * If the template argument Format is kCFO, the shape of col is:
+ * [input_channels, filter_height, filter_width, output_height, output_width]
+ * So, it is easy to reshape into a convolution matrix for convolution
+ * calculation based on matrix multiplication.
+ * The shape of the convolution matrix is [height, width], where the height is
+ * equal to input_channels * filter_height * filter_width, and the width is
+ * equal to output_height * output_width.
+ *
+ * Reshape:
+ *     shape of col               shape of convolution matrix
+ *     [input_channels,
+ *      filter_height,
+ *      filter_width,      ======>      [height, width]
+ *      output_height,
+ *      output_width]
+ *
+ * If the template argument Format is kOCF, the shape of col is:
+ * [output_height, output_width, input_channels, filter_height, filter_width]
+ * So, it is easy to reshape into a sequence matrix for rnn calculation.
+ * The shape of the sequence matrix is [seq_length, step_size], where the
+ * seq_length is equal to output_height * output_width, and the step_size is
+ * equal to input_channels * filter_height * filter_width.
+ *
+ * Reshape:
+ *     shape of col                 shape of sequence matrix
+ *     [output_height,
+ *      output_width,
+ *      input_channels,    ======>    [seq_length, step_size]
+ *      filter_height,
+ *      filter_width]
+ *
+ * \note The caller needs to ensure that the input_channels dimension of im is
+ *       equal to the input_channels dimension of col.
+ * \note This primary template only declares operator(); no definition is
+ *       given in this header.
+ */
+template <ColFormat Format, typename Place, typename T>
+class Im2ColFunctor {
+ public:
+  void operator()(const framework::Tensor& im, framework::Tensor& col,
+                  int stride_height, int stride_width, int padding_height,
+                  int padding_width, platform::DeviceContext* context);
+};
+template <ColFormat Format, typename Place, typename T>  // Format selects the layout of col
+class Col2ImFunctor {  // reverse direction: col is the input, im is the output
+ public:
+  void operator()(framework::Tensor& im, const framework::Tensor& col,  // im is non-const, col is const
+                  int stride_height, int stride_width, int padding_height,
+                  int padding_width, platform::DeviceContext* context);  // declaration only, no body in this header
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/math/im2col.h"
+#include <gtest/gtest.h>
+#include <iostream>
+// Runs the CFO and OCF im2col functors for `Place` on a fixed 1x2x3 input
+// with a 2x2 filter (stride 1, no padding) and checks every output element.
+// For a non-CPU place the input is copied to the device first and the
+// results are copied back to the CPU before they are compared.
+template <typename Place>
+void testIm2col() {
+  paddle::framework::Tensor input_tmp;
+  paddle::framework::Tensor input;
+  paddle::framework::Tensor output_cfo;
+  paddle::framework::Tensor output_ocf;
+  paddle::framework::Tensor output_tmp;
+  /**
+   * input = [0, 1, 2,
+   *          3, 4, 5]
+   *
+   * output_cfo = [0, 1
+   *               1, 2
+   *               3, 4
+   *               4, 5]
+   *
+   * output_ocf = [0, 1, 3, 4
+   *               1, 2, 4, 5]
+   */
+  int input_height = 2;
+  int input_width = 3;
+  int filter_size = 2;
+  int stride = 1;
+  int padding = 0;
+  int output_height = (input_height - filter_size + 2 * padding) / stride + 1;
+  int output_width = (input_width - filter_size + 2 * padding) / stride + 1;
+  float* input_ptr = input_tmp.mutable_data<float>(
+      {1, input_height, input_width}, paddle::platform::CPUPlace());
+  float arr[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input_ptr, arr, 6 * sizeof(float));
+  // The place lives on the stack so that it cannot leak.
+  Place place;
+  if (paddle::platform::is_cpu_place(place)) {
+    input = input_tmp;
+  } else {
+    input.CopyFrom<float>(input_tmp, place);
+  }
+  output_cfo.mutable_data<float>(
+      {1, filter_size, filter_size, output_height, output_width}, place);
+  output_ocf.mutable_data<float>(
+      {output_height, output_width, 1, filter_size, filter_size}, place);
+  paddle::operators::math::Im2ColFunctor<
+      paddle::operators::math::ColFormat::kCFO, Place, float>
+      im2col;
+  paddle::operators::math::Im2ColFunctor<
+      paddle::operators::math::ColFormat::kOCF, Place, float>
+      im2col_ocf;
+  // Initialised so that the pointer is never read while indeterminate.
+  paddle::platform::DeviceContext* context = nullptr;
+  if (paddle::platform::is_cpu_place(place)) {
+    context =
+        new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
+  } else {
+#ifndef PADDLE_ONLY_CPU
+    context =
+        new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
+#else
+    PADDLE_THROW("no GPU support");
+#endif  // PADDLE_ONLY_CPU
+  }
+  im2col(input, output_cfo, stride, stride, padding, padding, context);
+  im2col_ocf(input, output_ocf, stride, stride, padding, padding, context);
+  float* out_cfo_ptr;
+  if (paddle::platform::is_cpu_place(place)) {
+    out_cfo_ptr = output_cfo.data<float>();
+  } else {
+    output_tmp.CopyFrom<float>(output_cfo, paddle::platform::CPUPlace());
+    out_cfo_ptr = output_tmp.data<float>();
+  }
+  EXPECT_EQ(out_cfo_ptr[0], 0);
+  EXPECT_EQ(out_cfo_ptr[1], 1);
+  EXPECT_EQ(out_cfo_ptr[2], 1);
+  EXPECT_EQ(out_cfo_ptr[3], 2);
+  EXPECT_EQ(out_cfo_ptr[4], 3);
+  EXPECT_EQ(out_cfo_ptr[5], 4);
+  EXPECT_EQ(out_cfo_ptr[6], 4);
+  EXPECT_EQ(out_cfo_ptr[7], 5);
+  float* out_ocf_ptr;
+  if (paddle::platform::is_cpu_place(place)) {
+    out_ocf_ptr = output_ocf.data<float>();
+  } else {
+    output_tmp.CopyFrom<float>(output_ocf, paddle::platform::CPUPlace());
+    out_ocf_ptr = output_tmp.data<float>();
+  }
+  EXPECT_EQ(out_ocf_ptr[0], 0);
+  EXPECT_EQ(out_ocf_ptr[1], 1);
+  EXPECT_EQ(out_ocf_ptr[2], 3);
+  EXPECT_EQ(out_ocf_ptr[3], 4);
+  EXPECT_EQ(out_ocf_ptr[4], 1);
+  EXPECT_EQ(out_ocf_ptr[5], 2);
+  EXPECT_EQ(out_ocf_ptr[6], 4);
+  EXPECT_EQ(out_ocf_ptr[7], 5);
+  // Release the device context created above. NOTE(review): this relies on
+  // platform::DeviceContext having a virtual destructor - confirm.
+  delete context;
+}
+TEST(math, im2col) {  // runs the shared im2col check once per place type
+  testIm2col<paddle::platform::CPUPlace>();
+#ifndef PADDLE_ONLY_CPU  // the GPU run is compiled out when PADDLE_ONLY_CPU is defined
+  testIm2col<paddle::platform::GPUPlace>();
+#endif
+}
\ No newline at end of file
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -25,18 +25,27 @@ class MulOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto dim0 = ctx.Input<Tensor>("X")->dims();
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto dim1 = ctx.Input<Tensor>("Y")->dims();
+    auto y_dims = ctx.Input<Tensor>("Y")->dims();
-    PADDLE_ENFORCE_EQ(dim0.size(), 2,
+    int x_num_col_dims = Attr<int>("x_num_col_dims");
-                      "input X(%s) should be a tensor with 2 dims, a matrix",
+    int y_num_col_dims = Attr<int>("y_num_col_dims");
-                      ctx.op().Input("X"));
-    PADDLE_ENFORCE_EQ(dim1.size(), 2,
+    PADDLE_ENFORCE(x_dims.size() > x_num_col_dims,
-                      "input Y(%s) should be a tensor with 2 dims, a matrix",
+                   "The rank of input tensor X(%s) should be larger than "
-                      ctx.op().Input("Y"));
+                   "`mul_op`'s `x_num_col_dims`.",
+                   ctx.op().Input("X"));
+    PADDLE_ENFORCE(y_dims.size() > y_num_col_dims,
+                   "The rank of input tensor Y(%s) should be larger than "
+                   "`mul_op`'s `y_num_col_dims`.",
+                   ctx.op().Input("Y"));
+    auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
+    auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
    PADDLE_ENFORCE_EQ(
-        dim0[1], dim1[0],
+        x_mat_dims[1], y_mat_dims[0],
        "First matrix's width must be equal with second matrix's height.");
-    ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
+    ctx.Output<Tensor>("Out")->Resize({x_mat_dims[0], y_mat_dims[1]});
  }
 };
@@ -47,6 +56,23 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "The first input of mul op");
    AddInput("Y", "The second input of mul op");
    AddOutput("Out", "The output of mul op");
+    AddAttr<int>(
+        "x_num_col_dims",
+        R"DOC(mul_op can take tensors with more than two dimensions as input `X`, 
+            in that case, tensors will be reshaped to a matrix. The matrix's first 
+            dimension(column length) will be the product of tensor's last 
+            `num_col_dims` dimensions, and the matrix's second dimension(row length)
+            will be the product of tensor's first `rank - num_col_dims` dimensions.
+        )DOC")
+        .SetDefault(1)
+        .EqualGreaterThan(1);
+    AddAttr<int>(
+        "y_num_col_dims",
+        R"DOC(mul_op can take tensors with more than two dimensions as input `Y`,
+             in that case, tensors will be reshaped to a matrix. Just like input `X`.
+        )DOC")
+        .SetDefault(1)
+        .EqualGreaterThan(1);
    AddComment(R"DOC(
 Two Element Mul Operator.
@@ -70,10 +96,20 @@ class MulOpGrad : public framework::OperatorWithKernel {
    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *y_grad = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    PADDLE_ENFORCE(x_dims[0] == out_dims[0],
-                   "Out@GRAD M X N must equal to X dims 0, M ");
+    auto x_mat_dims =
-    PADDLE_ENFORCE(y_dims[1] == out_dims[1],
+        framework::flatten_to_2d(x_dims, Attr<int>("x_num_col_dims"));
-                   "Out@GRAD M X N must equal to Y dims 1, N ");
+    auto y_mat_dims =
+        framework::flatten_to_2d(y_dims, Attr<int>("y_num_col_dims"));
+    PADDLE_ENFORCE_EQ(
+        x_mat_dims[0], out_dims[0],
+        "The first dimension of Out@GRAD must equal to the first dimension of "
+        "the first operand.");
+    PADDLE_ENFORCE_EQ(
+        y_mat_dims[1], out_dims[1],
+        "The second dimension of Out@GRAD must equal to the second "
+        "dimension of the second operand.");
    if (x_grad) x_grad->Resize(x_dims);
    if (y_grad) y_grad->Resize(y_dims);

--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
+   You may not use this file except in compliance with the License.
   You may obtain a copy of the License at
   http://www.apache.org/licenses/LICENSE-2.0
@@ -31,13 +31,25 @@ template <typename Place, typename T>
 class MulKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
+    const Tensor* x = context.Input<Tensor>("X");
-    auto* y = context.Input<Tensor>("Y");
+    const Tensor* y = context.Input<Tensor>("Y");
-    auto* z = context.Output<Tensor>("Out");
+    Tensor* z = context.Output<Tensor>("Out");
+    const Tensor x_matrix =
+        x->dims().size() > 2
+            ? framework::ReshapeToMatrix<T>(
+                  *x, context.template Attr<int>("x_num_col_dims"))
+            : *x;
+    const Tensor y_matrix =
+        y->dims().size() > 2
+            ? framework::ReshapeToMatrix<T>(
+                  *y, context.template Attr<int>("y_num_col_dims"))
+            : *y;
    z->mutable_data<T>(context.GetPlace());
    auto* device_context =
        const_cast<platform::DeviceContext*>(context.device_context_);
-    math::matmul<Place, T>(*x, false, *y, false, 1, z, 0, device_context);
+    math::matmul<Place, T>(x_matrix, false, y_matrix, false, 1, z, 0,
+                           device_context);
  }
 };
@@ -45,23 +57,39 @@ template <typename Place, typename T>
 class MulGradKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
+    int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
-    auto* y = ctx.Input<Tensor>("Y");
+    int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* y = ctx.Input<Tensor>("Y");
+    const Tensor x_matrix =
+        x->dims().size() > 2 ? framework::ReshapeToMatrix<T>(*x, x_num_col_dims)
+                             : *x;
+    const Tensor y_matrix =
+        y->dims().size() > 2 ? framework::ReshapeToMatrix<T>(*y, y_num_col_dims)
+                             : *y;
+    const Tensor* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    Tensor* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
    auto* device_context =
        const_cast<platform::DeviceContext*>(ctx.device_context_);
    if (dx) {
      dx->mutable_data<T>(ctx.GetPlace());
+      Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix<T>(
+                                                     *dx, x_num_col_dims)
+                                               : *dx;
      // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
-      math::matmul<Place, T>(*dout, false, *y, true, 1, dx, 0, device_context);
+      math::matmul<Place, T>(*dout, false, y_matrix, true, 1, &dx_matrix, 0,
+                             device_context);
    }
    if (dy) {
      dy->mutable_data<T>(ctx.GetPlace());
+      Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix<T>(
+                                                     *dy, y_num_col_dims)
+                                               : *dy;
      // dy = x' * dout. dy K x N, dout : M x N, x : M x K
-      math::matmul<Place, T>(*x, true, *dout, false, 1, dy, 0, device_context);
+      math::matmul<Place, T>(x_matrix, true, *dout, false, 1, &dy_matrix, 0,
+                             device_context);
    }
  }
 };

--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -25,14 +25,19 @@ class RowwiseAddOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto dim0 = ctx.Input<Tensor>("X")->dims();
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto dim1 = ctx.Input<Tensor>("b")->dims();
+    auto b_dims = ctx.Input<Tensor>("b")->dims();
+    PADDLE_ENFORCE_GT(
-    PADDLE_ENFORCE(dim0.size() == 2, "Input 0 must be matrix");
+        x_dims.size(), b_dims.size(),
-    PADDLE_ENFORCE(dim1.size() == 1, "The second input must be vector");
+        "The rank of input `X` must be larger than the one of input `b`.");
-    PADDLE_ENFORCE(dim0[1] == dim1[0], "The width of two input must be same");
-    PADDLE_ENFORCE(ctx.OutputSize("Out") == 1, "The output size must be 1");
+    int num_col_dims = x_dims.size() - b_dims.size();
-    ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("X")->dims());
+    PADDLE_ENFORCE_EQ(
+        framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims,
+        "The width of two operands must be same");
+    PADDLE_ENFORCE_EQ(ctx.OutputSize("Out"), 1, "The output size must be 1");
+    ctx.Output<Tensor>("Out")->Resize(x_dims);
  }
 };
@@ -61,13 +66,20 @@ class RowwiseAddGradOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("b"), "b should not be null");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
                            "Input(Out@GRAD) should not be null");
-    auto dims0 = ctx.Input<Tensor>("X")->dims();
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto dims1 = ctx.Input<Tensor>("b")->dims();
+    auto b_dims = ctx.Input<Tensor>("b")->dims();
-    PADDLE_ENFORCE_EQ(1, dims1.size(), "b dims should be 1")
+    PADDLE_ENFORCE_GT(
+        x_dims.size(), b_dims.size(),
+        "The rank of input `X` must be larger than the one of input `b`.");
+    int num_col_dims = x_dims.size() - b_dims.size();
+    PADDLE_ENFORCE_EQ(
+        framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims,
+        "The width of two operands must be same");
    auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *db = ctx.Output<Tensor>(framework::GradVarName("b"));
-    if (dx) dx->Resize(dims0);
+    if (dx) dx->Resize(x_dims);
-    if (db) db->Resize(dims1);
+    if (db) db->Resize(b_dims);
  }
 };

--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -33,10 +33,12 @@ class RowwiseAddKernel : public framework::OpKernel {
  void Compute(const framework::ExecutionContext& context) const override {
    auto out = context.Output<Tensor>("Out");
    out->mutable_data<T>(context.GetPlace());
+    int num_col_dims = context.Input<Tensor>("X")->dims().size() -
-    auto input = EigenMatrix<T>::From(*context.Input<Tensor>("X"));
+                       context.Input<Tensor>("b")->dims().size();
-    auto bias = EigenVector<T>::From(*context.Input<Tensor>("b"));
+    auto input =
-    auto output = EigenMatrix<T>::From(*out);
+        EigenMatrix<T>::Reshape(*context.Input<Tensor>("X"), num_col_dims);
+    auto bias = EigenVector<T>::Flatten(*context.Input<Tensor>("b"));
+    auto output = EigenMatrix<T>::Reshape(*out, num_col_dims);
    const int bias_size = bias.dimension(0);
    const int rest_size = input.size() / bias_size;
@@ -54,12 +56,15 @@ class RowwiseAddGradKernel : public framework::OpKernel {
    auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
    auto* db = context.Output<Tensor>(framework::GradVarName("b"));
+    int num_col_dims = context.Input<Tensor>("X")->dims().size() -
+                       context.Input<Tensor>("b")->dims().size();
-    auto out_grad = EigenMatrix<T>::From(*dout);
+    auto out_grad = EigenMatrix<T>::Reshape(*dout, num_col_dims);
    auto place = context.GetEigenDevice<Place>();
    if (dx) {
      dx->mutable_data<T>(context.GetPlace());
-      EigenMatrix<T>::From(*dx).device(place) = out_grad;
+      EigenMatrix<T>::Reshape(*dx, num_col_dims).device(place) = out_grad;
    }
    if (db) {

--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -44,11 +44,13 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
 The equation is: Out = scale*X
 )DOC");
-    AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
+    AddAttr<AttrType>("scale", "The scaling factor of the scale operator.")
+        .SetDefault(1.0);
  }
 };
-// Scale Op's gradient is scale op, too.
+// The operator to calculate gradients of a scale operator is just the scale
+// operator itself.
 // Grad(Out=scale(X)) => Grad(X) = scale(Grad(Out))
 template <typename AttrType>
 class ScaleGradOp : public NetOp {

--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -51,7 +51,7 @@ the other dimensions in the K-dimensional vector input. Then the ratio of the
 exponential of the given dimension and the sum of exponential values of all
 the other dimensions is the output of the softmax operator.
-For each row `i` and each column `j` in X, we have:
+For each row `i` and each column `j` in input X, we have:
    Y[i, j] = exp(X[i, j]) / sum_j(exp(X[i, j]))
 )DOC");
@@ -64,14 +64,15 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE(ctx.InputVar("Y") != nullptr, "Input(Y) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should be not null.");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")),
-                            "Input(Y@GRAD) should not be null");
+                            "Input(Y@GRAD) should be not null.");
-    PADDLE_ENFORCE(ctx.Input<Tensor>("Y")->dims() ==
+    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Y")->dims(),
-                       ctx.Input<Tensor>(framework::GradVarName("Y"))->dims(),
+                      ctx.Input<Tensor>(framework::GradVarName("Y"))->dims(),
-                   "the shape of Input(0) and Input(1) should be the same");
+                      "Input(Y) and its gradients should have a same shape.");
    ctx.Output<Tensor>(framework::GradVarName("X"))
-        ->Resize(ctx.Input<Tensor>("Y")->dims());
+        ->Resize(ctx.Input<Tensor>("X")->dims());
  }
 };

--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -28,12 +28,12 @@ template <typename Place, typename T>
 class SoftmaxKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto input = context.Input<Tensor>("X");
+    auto X = context.Input<Tensor>("X");
-    auto output = context.Output<Tensor>("Y");
+    auto Y = context.Output<Tensor>("Y");
-    output->mutable_data<T>(context.GetPlace());
+    Y->mutable_data<T>(context.GetPlace());
-    auto logits = EigenMatrix<T>::From(*input);
+    auto logits = EigenMatrix<T>::From(*X);
-    auto softmax = EigenMatrix<T>::From(*output);
+    auto softmax = EigenMatrix<T>::From(*Y);
    const int kBatchDim = 0;
    const int kClassDim = 1;

--- a/paddle/operators/top_k_op.cc
+++ b/paddle/operators/top_k_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/top_k_op.h"
+namespace paddle {
+namespace operators {
+// Operator definition for top_k; this class only performs shape inference
+// for the outputs "Out" and "Indices".
+class TopkOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  // Validates input "X" and attribute "k", then resizes both outputs to the
+  // shape of "X" with its last dimension replaced by k.
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input of TopkOP must be initialized.");
+    auto *input = ctx.Input<framework::Tensor>("X");
+    const int k = ctx.Attr<int>("k");
+
+    // The checked expressions are kept verbatim so that any text the enforce
+    // macros derive from them stays the same.
+    PADDLE_ENFORCE_GE(k, 1, "k must >= 1");
+    PADDLE_ENFORCE_GE(input->dims().size(), 1, "input must have >= 1d shape");
+    PADDLE_ENFORCE_GE(input->dims()[input->dims().size() - 1], k,
+                      "input must have >= k columns");
+
+    // Same shape as the input, except that the last axis shrinks to k.
+    framework::DDim out_dims = input->dims();
+    const int last_axis = out_dims.size() - 1;
+    out_dims[last_axis] = k;
+
+    auto *values = ctx.Output<Tensor>("Out");
+    values->Resize(out_dims);
+    auto *positions = ctx.Output<Tensor>("Indices");
+    positions->Resize(out_dims);
+  }
+};
+// Declares the inputs, outputs, documentation and attributes of top_k.
+class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  TopkOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // One input and two outputs.
+    AddInput("X", "The input of Topk op");
+    AddOutput("Out", "The output tensor of Topk op");
+    AddOutput("Indices", "The indices of Topk elements of input");
+
+    AddComment(
+        R"DOC(If the input is a vector (1d tensor), finds the k largest entries in the vector and outputs their values and indices as vectors. Thus values[j] is the j-th largest entry in input, and its index is indices[j].
+    For matrices, computes the top k entries in each row. )DOC");
+
+    // "k" falls back to 1 when the attribute is not given.
+    auto &k_attr = AddAttr<int>(
+        "k",
+        "Number of top elements to look for along the last "
+        "dimension (along each row for matrices).");
+    k_attr.SetDefault(1);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+// Register top_k as an operator without a gradient operator, together with
+// its CPU kernel instantiated for float.
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(top_k, ops::TopkOp, ops::TopkOpMaker);
+REGISTER_OP_CPU_KERNEL(top_k,
+                       ops::TopkKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/top_k_op.cu
+++ b/paddle/operators/top_k_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/assert.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+// A (value, index) pair; the element type of the per-thread and shared
+// top-k candidate lists.
+//
+// Ordering between two pairs: the pair with the smaller value is "less"; on
+// equal values the pair with the LARGER id is "less", so the lower index
+// ranks first among ties.
+template <typename T>
+struct Pair {
+  // Leaves both fields uninitialized.
+  __device__ __forceinline__ Pair() {}
+  __device__ __forceinline__ Pair(T value, int id) : v(value), id(id) {}
+  // Overwrites both fields.
+  __device__ __forceinline__ void set(T value, int id) {
+    v = value;
+    // The parameter shadows the member of the same name, so the member has to
+    // be reached through `this`. A plain `id = id` assigns the parameter to
+    // itself and leaves the member untouched.
+    this->id = id;
+  }
+  __device__ __forceinline__ void operator=(const Pair<T>& in) {
+    v = in.v;
+    id = in.id;
+  }
+  // Compares the stored value only against a bare value.
+  __device__ __forceinline__ bool operator<(const T value) const {
+    return (v < value);
+  }
+  __device__ __forceinline__ bool operator<(const Pair<T>& in) const {
+    return (v < in.v) || ((v == in.v) && (id > in.id));
+  }
+  __device__ __forceinline__ bool operator>(const Pair<T>& in) const {
+    return (v > in.v) || ((v == in.v) && (id < in.id));
+  }
+  T v;
+  int id;
+};
+// Inserts p into topk[0 .. beam_size - 1]: entries that compare less than p
+// are shifted one slot towards the end (the last one is dropped) and p is
+// stored in the slot that opens up.
+template <typename T>
+__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
+                                      int beam_size) {
+  int k = beam_size - 2;
+  while (k >= 0 && topk[k] < p) {
+    topk[k + 1] = topk[k];
+    --k;
+  }
+  // k now indexes the first entry (scanning from the back) that is not less
+  // than p, or is negative when no such entry was found; in the latter case
+  // p goes to the front.
+  topk[k >= 0 ? k + 1 : 0] = p;
+}
+// Same insertion as the runtime-sized AddTo, with the list length supplied
+// as a template argument.
+template <typename T, int beam_size>
+__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p) {
+  int k = beam_size - 2;
+  while (k >= 0 && topk[k] < p) {
+    topk[k + 1] = topk[k];
+    --k;
+  }
+  // Negative k means every scanned entry was less than p: p becomes the head.
+  topk[k >= 0 ? k + 1 : 0] = p;
+}
+// Visits src[idx], src[idx + BlockSize], ... while the position is below dim
+// and inserts every element whose value exceeds that of the current last
+// entry of topk. The element's position in src is recorded as its id.
+template <typename T, int BlockSize>
+__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
+                                        int dim, int beam_size) {
+  for (int pos = idx; pos < dim; pos += BlockSize) {
+    if (!(topk[beam_size - 1] < src[pos])) continue;
+    const Pair<T> candidate(src[pos], pos);
+    AddTo<T>(topk, candidate, beam_size);
+  }
+}
+// Like the overload without `max`, but an element is inserted only if it
+// additionally compares less than `max` (pair ordering, i.e. value first and
+// id as tie-breaker).
+template <typename T, int BlockSize>
+__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
+                                        int dim, const Pair<T>& max,
+                                        int beam_size) {
+  for (int pos = idx; pos < dim; pos += BlockSize) {
+    if (!(topk[beam_size - 1] < src[pos])) continue;
+    const Pair<T> candidate(src[pos], pos);
+    if (candidate < max) {
+      AddTo<T>(topk, candidate, beam_size);
+    }
+  }
+}
+// Variant that takes values and ids from two parallel arrays: val[pos] is
+// the value and col[pos] the id recorded for it.
+template <typename T, int BlockSize>
+__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
+                                        int idx, int dim, int beam_size) {
+  for (int pos = idx; pos < dim; pos += BlockSize) {
+    if (!(topk[beam_size - 1] < val[pos])) continue;
+    const Pair<T> candidate(val[pos], col[pos]);
+    AddTo<T>(topk, candidate, beam_size);
+  }
+}
+// Parallel-array variant with an upper bound: (val[pos], col[pos]) is
+// inserted only if it beats the current last entry of topk and compares
+// less than `max`.
+template <typename T, int BlockSize>
+__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
+                                        int idx, int dim, const Pair<T>& max,
+                                        int beam_size) {
+  for (int pos = idx; pos < dim; pos += BlockSize) {
+    if (!(topk[beam_size - 1] < val[pos])) continue;
+    const Pair<T> candidate(val[pos], col[pos]);
+    if (candidate < max) {
+      AddTo<T>(topk, candidate, beam_size);
+    }
+  }
+}
+// Refills this thread's candidate list when `beam` entries of it have been
+// consumed (beam > 0); does nothing otherwise.
+//
+// On the first call the list is filled directly from src. On later calls the
+// unconsumed entries are moved to the front, the freed tail is reset to the
+// (-INFINITY, -1) sentinel and, unless is_empty is set, refilled with
+// elements that compare less than `max`. Afterwards `max` holds the last
+// list entry and `beam` is reset to 0.
+template <typename T, int MaxLength, int BlockSize>
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
+                                              int beam_size, const T* src,
+                                              bool& firstStep, bool& is_empty,
+                                              Pair<T>& max, int dim,
+                                              const int tid) {
+  if (beam <= 0) return;
+
+  const int length = beam < beam_size ? beam : beam_size;
+  if (firstStep) {
+    firstStep = false;
+    GetTopK<T, BlockSize>(topk, src, tid, dim, length);
+  } else {
+    // Number of entries that survive the shift towards the front.
+    const int keep = MaxLength - beam;
+    for (int slot = 0; slot < MaxLength; ++slot) {
+      if (slot < keep) {
+        topk[slot] = topk[slot + beam];
+      } else {
+        topk[slot].set(-INFINITY, -1);
+      }
+    }
+    if (!is_empty) {
+      GetTopK<T, BlockSize>(topk + MaxLength - beam, src, tid, dim, max,
+                            length);
+    }
+  }
+
+  max = topk[MaxLength - 1];
+  // NOTE(review): this compares the VALUE with -1, while the sentinel written
+  // above uses -INFINITY as value and -1 as id. Possibly `max.id == -1` was
+  // intended - confirm before changing.
+  if (max.v == -1) is_empty = true;
+  beam = 0;
+}
+// Parallel-array variant of ThreadGetTopK: values come from `val` and the
+// ids recorded for them from `col`. Refills this thread's candidate list
+// when `beam` entries have been consumed (beam > 0); does nothing otherwise.
+// Afterwards `max` holds the last list entry and `beam` is reset to 0.
+template <typename T, int MaxLength, int BlockSize>
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
+                                              int beam_size, const T* val,
+                                              int* col, bool& firstStep,
+                                              bool& is_empty, Pair<T>& max,
+                                              int dim, const int tid) {
+  if (beam <= 0) return;
+
+  const int length = beam < beam_size ? beam : beam_size;
+  if (firstStep) {
+    firstStep = false;
+    GetTopK<T, BlockSize>(topk, val, col, tid, dim, length);
+  } else {
+    // Number of entries that survive the shift towards the front.
+    const int keep = MaxLength - beam;
+    for (int slot = 0; slot < MaxLength; ++slot) {
+      if (slot < keep) {
+        topk[slot] = topk[slot + beam];
+      } else {
+        topk[slot].set(-INFINITY, -1);
+      }
+    }
+    if (!is_empty) {
+      GetTopK<T, BlockSize>(topk + MaxLength - beam, val, col, tid, dim, max,
+                            length);
+    }
+  }
+
+  max = topk[MaxLength - 1];
+  // NOTE(review): this compares the VALUE with -1, while the sentinel written
+  // above uses -INFINITY as value and -1 as id. Possibly `max.id == -1` was
+  // intended - confirm before changing.
+  if (max.v == -1) is_empty = true;
+  beam = 0;
+}
+// Repeatedly picks the largest entry among sh_topk[0 .. BlockSize - 1] and
+// emits it through *topVal / *topIds, until either k results have been
+// produced or the owner of the latest winner has used up its private list.
+//
+//   sh_topk  one candidate per thread (array of BlockSize entries)
+//   maxid    scratch array of BlockSize / 2 indices into sh_topk
+//   topk     the calling thread's private candidate list
+//   topVal / topIds  pointers to the caller's output cursors
+//   beam     number of entries of `topk` already consumed by this thread
+//   k        number of results still to be produced (decremented here)
+//
+// NOTE(review): all threads of the block are presumably expected to call
+// this together, since it contains __syncthreads() - confirm with caller.
+template <typename T, int MaxLength, int BlockSize>
+__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
+                                            Pair<T> topk[], T** topVal,
+                                            int** topIds, int& beam, int& k,
+                                            const int tid, const int warp) {
+  while (true) {
+    __syncthreads();
+    // First reduction step: compare the two halves of sh_topk and remember
+    // the index of the larger entry of each pair.
+    if (tid < BlockSize / 2) {
+      if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) {
+        maxid[tid] = tid + BlockSize / 2;
+      } else {
+        maxid[tid] = tid;
+      }
+    }
+    __syncthreads();
+    // Remaining steps operate on the recorded indices, halving the stride
+    // each time; afterwards maxid[0] is the index of the largest entry.
+    for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) {
+      if (tid < stride) {
+        if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) {
+          maxid[tid] = maxid[tid + stride];
+        }
+      }
+      __syncthreads();
+    }
+    __syncthreads();
+    // Thread 0 writes the winner and advances the output cursors it was
+    // given.
+    if (tid == 0) {
+      **topVal = sh_topk[maxid[0]].v;
+      **topIds = sh_topk[maxid[0]].id;
+      (*topVal)++;
+      (*topIds)++;
+    }
+    // The thread whose candidate won has consumed one more private entry.
+    if (tid == maxid[0]) beam++;
+    // Every thread counts down its own k; stop once k results were produced.
+    if (--k == 0) break;
+    __syncthreads();
+    // The winner's owner publishes its next private candidate, if any left.
+    if (tid == maxid[0]) {
+      if (beam < MaxLength) {
+        sh_topk[tid] = topk[beam];
+      }
+    }
+    // Only the warp containing the winner's owner leaves the loop when that
+    // owner has consumed all MaxLength private entries; the owner's `beam`
+    // is read from its lane via __shfl.
+    // NOTE(review): __shfl is the legacy non-sync shuffle intrinsic; newer
+    // CUDA toolchains expect __shfl_sync - confirm the targeted CUDA version.
+    if (maxid[0] / 32 == warp) {
+      if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break;
+    }
+  }
+}
+/**
+ * Each thread block processes one row of src (row index = blockIdx.x).
+ * Within a block, while results remain to be produced:
+ * 1. every thread (re)fills its private list of up to MaxLength candidates
+ *    via ThreadGetTopK;
+ * 2. every thread publishes the head of its list to sh_topk;
+ * 3. BlockReduce repeatedly selects the block-wide maximum and writes it to
+ *    output / indices, returning when k reaches zero or when a thread has
+ *    consumed its whole private list, in which case the loop starts over
+ *    at step 1.
+ */
+template <typename T, int MaxLength, int BlockSize>
+__global__ void KeMatrixTopK(T* output, int output_stride, int* indices,
+                             const T* src, int lds, int dim, int k) {
+  __shared__ Pair<T> sh_topk[BlockSize];
+  __shared__ int maxid[BlockSize / 2];
+
+  const int tid = threadIdx.x;
+  const int warp = threadIdx.x / 32;
+
+  // Move the cursors to the row handled by this block.
+  const T* row_src = src + blockIdx.x * lds;
+  output += blockIdx.x * output_stride;
+  indices += blockIdx.x * k;
+
+  // Per-thread state shared between ThreadGetTopK and BlockReduce.
+  Pair<T> topk[MaxLength];
+  Pair<T> max;
+  int beam = MaxLength;
+  bool first_step = true;
+  bool is_empty = false;
+
+  for (int slot = 0; slot < MaxLength; ++slot) {
+    topk[slot].set(-INFINITY, -1);
+  }
+
+  // BlockReduce decrements k once per emitted result.
+  while (k) {
+    ThreadGetTopK<T, MaxLength, BlockSize>(topk, beam, k, row_src, first_step,
+                                           is_empty, max, dim, tid);
+    sh_topk[tid] = topk[0];
+    BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
+                                         &indices, beam, k, tid, warp);
+  }
+}
+// GPU kernel wrapper for top_k: launches KeMatrixTopK with one thread block
+// per row, where a row is one run along the last axis of input "X".
+template <typename T>
+class TopkOpCUDAKernel : public framework::OpKernel {
+ public:
+  // Reads "X" and attribute "k", allocates "Out" (type T) and "Indices"
+  // (int) on the execution place and launches the kernel.
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    auto* indices = ctx.Output<Tensor>("Indices");
+    size_t k = static_cast<size_t>(ctx.Attr<int>("k"));
+
+    const T* input_data = input->data<T>();
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    // FIXME(typhoonzero): data is always converted to type T?
+    int* indices_data = indices->mutable_data<int>(ctx.GetPlace());
+
+    // Shape inference accepts any rank >= 1, so do not assume a matrix: the
+    // last axis is the row width and all leading axes are flattened into the
+    // row count (1 for a vector). For a 2-D input this equals dims()[0] and
+    // dims()[1].
+    const auto in_dims = input->dims();
+    const int rank = in_dims.size();
+    size_t input_width = in_dims[rank - 1];
+    size_t input_height = 1;
+    for (int axis = 0; axis + 1 < rank; ++axis) {
+      input_height *= in_dims[axis];
+    }
+    if (k > input_width) k = input_width;
+
+    // Row stride of the output is the extent of its last axis.
+    const auto out_dims = output->dims();
+    const int output_width = out_dims[out_dims.size() - 1];
+
+    // NOTE: pass lds and dim same to input width.
+    // NOTE: old matrix implementation of stride is different to eigen.
+    // TODO(typhoonzero): launch kernel on specified stream.
+    // TODO(typhoonzero): refine this kernel.
+    dim3 threads(256, 1);
+    dim3 grid(input_height, 1);
+    KeMatrixTopK<T, 5, 256><<<grid, threads>>>(
+        output_data, output_width, indices_data, input_data, input_width,
+        input_width, int(k));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+// Register the GPU kernel of top_k, instantiated for float.
+REGISTER_OP_GPU_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>);
--- a/paddle/operators/top_k_op.h
+++ b/paddle/operators/top_k_op.h
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -49,6 +49,7 @@ USE_OP(minus);
 USE_OP(cos_sim);
 USE_CPU_ONLY_OP(gather);
 USE_CPU_ONLY_OP(scatter);
+USE_OP(top_k);
 USE_OP(squared_l2_distance);
 USE_OP(reshape);

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -37,7 +37,7 @@ Configuring cmake in /paddle/build ...
      -DWITH_PYTHON=${WITH_PYTHON:-ON}
      -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
      -DCUDNN_ROOT=/usr/
-      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
+      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
      -DWITH_TESTING=${WITH_TESTING:-ON}
      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 ========================================

--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
--- a/paddle/scripts/travis/build_android.sh
+++ b/paddle/scripts/travis/build_android.sh
@@ -22,6 +22,7 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
      -DANDROID_ABI=armeabi-v7a \
      -DANDROID_ARM_NEON=ON \
      -DANDROID_ARM_MODE=ON \
+      -DUSE_EIGEN_FOR_BLAS=ON \
      -DWITH_C_API=ON \
      -DWITH_SWIG_PY=OFF \
      -DWITH_STYLE_CHECK=OFF \

--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -320,6 +320,9 @@ void loadFileList(const std::string& fileListFileName,
 }
 double getMemoryUsage() {
+#if defined(__ANDROID__)
+  return 0.0;
+#else
  FILE* fp = fopen("/proc/meminfo", "r");
  CHECK(fp) << "failed to fopen /proc/meminfo";
  size_t bufsize = 256 * sizeof(char);
@@ -357,6 +360,7 @@ double getMemoryUsage() {
  delete[] buf;
  double usedMem = 1.0 - 1.0 * (freeMem + bufMem + cacheMem) / totalMem;
  return usedMem;
+#endif
 }
 SyncThreadPool* getGlobalSyncThreadPool() {

--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
@@ -33,6 +33,13 @@ limitations under the License. */
 #include "Flags.h"
 #include "hl_gpu.h"
+// Stand-in for rand_r() when building for Android API levels below 21
+// (NOTE(review): presumably because rand_r is not available there - confirm).
+// The seed argument is ignored and the call is forwarded to rand(), so the
+// result comes from rand()'s own state rather than from *seedp.
+#if defined(__ANDROID__) && (__ANDROID_API__ < 21)
+inline int rand_r(unsigned int* seedp) {
+  (void)seedp;
+  return rand();
+}
+#endif
 /**
 * Loop over the elements in a container
 * TODO(yuyang18): It's this foreach useful? Why not use C++ 11 foreach,

--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py
--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/framework/op.py
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
--- a/python/paddle/v2/framework/tests/mnist.py
+++ b/python/paddle/v2/framework/tests/mnist.py
--- a/python/paddle/v2/framework/tests/test_gradient_checker.py
+++ b/python/paddle/v2/framework/tests/test_gradient_checker.py
--- a/python/paddle/v2/framework/tests/test_lookup_table.py
+++ b/python/paddle/v2/framework/tests/test_lookup_table.py
--- a/python/paddle/v2/framework/tests/test_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_mul_op.py
--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
--- a/python/paddle/v2/framework/tests/test_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_op.py
--- a/python/paddle/v2/framework/tests/test_top_k_op.py
+++ b/python/paddle/v2/framework/tests/test_top_k_op.py