Merge branch 'develop' into fix-graph-error

1e5925a2 · Peng Li · 53e8b1a6 · f8192f72 · 1e5925a2 · 1e5925a2
131 changed file
--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@@ -22,5 +22,5 @@ def initHook(settings, height, width, color, num_class, **kwargs):
 def process(settings, file_list):
    for i in xrange(1024):
        img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
-        lab = random.randint(0, settings.num_class)
+        lab = random.randint(0, settings.num_class - 1)
        yield img.astype('float32'), int(lab)
--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn.sh
+set -e
+unset OMP_NUM_THREADS MKL_NUM_THREADS
+export OMP_DYNAMIC="FALSE"
+export KMP_AFFINITY="granularity=fine,compact,0,0"
+function train() {
+  # Run one timed `paddle train` job and tee its output into a file under logs/.
+  #   $1: topology name; the config file used is "${topology}.py"
+  #   $2: batch size, forwarded to the config as batch_size=<bs>
+  #   $3: "True"  -> --use_mkldnn=True with a single trainer,
+  #       "False" -> --use_mkldnn=False with `nproc` trainers.
+  # Returns non-zero without launching anything when $3 is neither value.
+  topology=$1
+  bs=$2
+  use_mkldnn=$3
+  # Quote the operand so an empty or missing $3 reaches the error branch
+  # instead of breaking the test expression.
+  if [ "$use_mkldnn" == "True" ]; then
+    thread=1
+    log="logs/${topology}-mkldnn-${bs}.log"
+  elif [ "$use_mkldnn" == "False" ]; then
+    thread=`nproc`
+    log="logs/${topology}-${thread}mklml-${bs}.log"
+  else
+    echo "Wrong input $3, use True or False."
+    # thread and log are not set on this path, so do not start the job.
+    return 1
+  fi
+  args="batch_size=${bs}"
+  config="${topology}.py"
+  paddle train --job=time \
+    --config="$config" \
+    --use_mkldnn="$use_mkldnn" \
+    --use_gpu=False \
+    --trainer_count="$thread" \
+    --log_period=10 \
+    --test_period=100 \
+    --config_args="$args" \
+    2>&1 | tee "${log}"
+}
+# train.list is a regular file, so test it with -f. The previous -d test is
+# never true for a file, which made this branch rewrite the list on every run.
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+# Log files written by train() live under logs/.
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+#========= mkldnn =========#
+# vgg
+train vgg 64 True
+train vgg 128 True
+train vgg 256 True
+#========== mklml ===========#
+train vgg 64 False
+train vgg 128 False
+train vgg 256 False
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
+#!/usr/bin/env python
+from paddle.trainer_config_helpers import *
+height = 224
+width = 224
+num_class = 1000
+batch_size = get_config_arg('batch_size', int, 64)
+layer_num = get_config_arg('layer_num', int, 19)
+args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+define_py_data_sources2(
+    "train.list", None, module="provider", obj="process", args=args)
+settings(
+    batch_size=batch_size,
+    learning_rate=0.01 / batch_size,
+    learning_method=MomentumOptimizer(0.9),
+    regularization=L2Regularization(0.0005 * batch_size))
+img = data_layer(name='image', size=height * width * 3)
+def vgg_network(vgg_num=3):
+    tmp = img_conv_group(
+        input=img,
+        num_channels=3,
+        conv_padding=1,
+        conv_num_filter=[64, 64],
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_size=2,
+        pool_stride=2,
+        pool_type=MaxPooling())
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=[128, 128],
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+    channels = []
+    for i in range(vgg_num):
+        channels.append(256)
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=channels,
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+    channels = []
+    for i in range(vgg_num):
+        channels.append(512)
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=channels,
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=channels,
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+    tmp = fc_layer(
+        input=tmp,
+        size=4096,
+        act=ReluActivation(),
+        layer_attr=ExtraAttr(drop_rate=0.5))
+    tmp = fc_layer(
+        input=tmp,
+        size=4096,
+        act=ReluActivation(),
+        layer_attr=ExtraAttr(drop_rate=0.5))
+    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
+if layer_num == 16:
+    vgg = vgg_network(3)
+elif layer_num == 19:
+    vgg = vgg_network(4)
+else:
+    print("Wrong layer number.")
+lab = data_layer('label', num_class)
+loss = cross_entropy(input=vgg, label=lab)
+outputs(loss)
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -106,22 +106,22 @@ function(merge_static_libs TARGET_NAME)
  endforeach()
  list(REMOVE_DUPLICATES libs_deps)
-  if(APPLE) # Use OSX's libtool to merge archives
+  # To produce a library we need at least one source file.
-    # To produce a library we need at least one source file.
+  # It is created by add_custom_command below and will
-    # It is created by add_custom_command below and will helps
+  # also help to track dependencies.
-    # also help to track dependencies.
+  set(target_SRCS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+  if(APPLE) # Use OSX's libtool to merge archives
    # Make the generated dummy source file depended on all static input
    # libs. If input lib changes,the source file is touched
    # which causes the desired effect (relink).
-    add_custom_command(OUTPUT ${dummyfile}
+    add_custom_command(OUTPUT ${target_SRCS}
-      COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
      DEPENDS ${libs})
    # Generate dummy staic lib
-    file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+    file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
-    add_library(${TARGET_NAME} STATIC ${dummyfile})
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
    target_link_libraries(${TARGET_NAME} ${libs_deps})
    foreach(lib ${libs})
@@ -130,11 +130,14 @@ function(merge_static_libs TARGET_NAME)
    endforeach()
    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
      COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
-      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
+      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
+      )
  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+    set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)
    foreach(lib ${libs})
-      set(objlistfile ${lib}.objlist) # list of objects in the input library
+      set(objlistfile ${target_DIR}/${lib}.objlist) # list of objects in the input library
-      set(objdir ${lib}.objdir)
+      set(objdir ${target_DIR}/${lib}.objdir)
      add_custom_command(OUTPUT ${objdir}
        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
@@ -142,31 +145,32 @@ function(merge_static_libs TARGET_NAME)
      add_custom_command(OUTPUT ${objlistfile}
        COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
-        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ../${objlistfile}
+        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ${objlistfile}
        DEPENDS ${lib} ${objdir}
        WORKING_DIRECTORY ${objdir})
-      # Empty dummy source file that goes into merged library		
+      list(APPEND target_OBJS "${objlistfile}")
-      set(mergebase ${lib}.mergebase.c)		
-      add_custom_command(OUTPUT ${mergebase}		
-        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}		
-        DEPENDS ${objlistfile})		
-      list(APPEND mergebases "${mergebase}")
    endforeach()
-    add_library(${TARGET_NAME} STATIC ${mergebases})
+    # Make the generated dummy source file depended on all static input
+    # libs. If input lib changes,the source file is touched
+    # which causes the desired effect (relink).
+    add_custom_command(OUTPUT ${target_SRCS}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
+      DEPENDS ${libs} ${target_OBJS})
+    # Generate dummy static lib
+    file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
    target_link_libraries(${TARGET_NAME} ${libs_deps})
    # Get the file name of the generated library
-    set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
+    set(target_LIBNAME "$<TARGET_FILE:${TARGET_NAME}>")
-    foreach(lib ${libs})
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+        COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
-        COMMAND ${CMAKE_AR} cr ${outlibfile} *.o
+        COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
-        COMMAND ${CMAKE_RANLIB} ${outlibfile}
+        WORKING_DIRECTORY ${target_DIR})
-        WORKING_DIRECTORY ${lib}.objdir)
-    endforeach()
  endif()
 endfunction(merge_static_libs)
@@ -196,7 +200,7 @@ function(cc_library TARGET_NAME)
    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
  else(cc_library_SRCS)
-    if (cc_library_DEPS)
+    if(cc_library_DEPS)
      merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
    else()
      message(FATAL "Please specify source file or library in cc_library.")
@@ -249,7 +253,7 @@ function(nv_library TARGET_NAME)
      foreach(source_file ${nv_library_SRCS})
        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-          list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
        endif()
      endforeach()
      add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})

--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -25,7 +25,7 @@ function(target_circle_link_libraries TARGET_NAME)
            endif()
        endforeach()
        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
-            if(IOS AND NOT IOS_ENABLE_BITCODE)
+            if(NOT IOS_ENABLE_BITCODE)
                list(APPEND LIBS "-undefined dynamic_lookup")
            endif()
        endif()
@@ -97,6 +97,10 @@ function(link_paddle_exe TARGET_NAME)
        target_link_libraries(${TARGET_NAME} log)
    endif(ANDROID)
+    if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR)
+      target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+    endif()
    add_dependencies(${TARGET_NAME} ${external_project_dependencies})
 endfunction()

--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -158,17 +158,23 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。
-7. \*-cp27mu-linux_x86_64.whl is not a supported wheel on this platform.
+7. paddlepaddle\*.whl is not a supported wheel on this platform.
 ------------------------------------------------------------------------
-出现这个问题的主要原因是，系统编译wheel包的时候，使用的 :code:`wheel` 包是最新的，
+出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。最新的paddlepaddle python安装包支持Linux x86_64和MacOS 10.12操作系统，并安装了python 2.7和pip 9.0.1。
-而系统中的 :code:`pip` 包比较老。具体的解决方法是，更新 :code:`pip` 包并重新编译PaddlePaddle。
 更新 :code:`pip` 包的方法是\:
 ..  code-block:: bash
    pip install --upgrade pip
+如果还不行，可以执行 :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` 获取当前系统支持的python包的后缀，
+并对比是否和正在安装的后缀一致。
+如果系统支持的是 :code:`linux_x86_64` 而安装包是 :code:`manylinux1_x86_64` ，需要升级pip版本到最新；
+如果系统支持 :code:`manylinux1_x86_64` 而安装包（本地）是 :code:`linux_x86_64` ，可以重命名这个whl包为 :code:`manylinux1_x86_64` 再安装。
 8.  python相关的单元测试都过不了
 --------------------------------
@@ -310,7 +316,7 @@ Paddle二进制在运行时捕获了浮点数异常，只要出现浮点数异
 * 模型一直不收敛，发散到了一个数值特别大的地方。
 * 训练数据有问题，导致参数收敛到了一些奇异的情况。或者输入数据尺度过大，有些特征的取值达到数百万，这时进行矩阵乘法运算就可能导致浮点数溢出。
-主要的解决办法是减小学习律或者对数据进行归一化处理。
+主要的解决办法是减小学习率或者对数据进行归一化处理。
 15. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2
 ------------------------------------------------------------------------
@@ -373,3 +379,136 @@ PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数
    parameters = paddle.parameters.create(my_cost)
    parameters.set('emb', load_parameter(emb_param_file, 30000, 256))
+18. 集群多节点训练，日志中保存均为网络通信类错误
+--------------------------------------------------
+集群多节点训练，日志报错为网络通信类错误，比如 :code:`Connection reset by peer` 等。
+此类报错通常是由于某一个节点的错误导致这个节点的训练进程退出，从而引发其他节点无法连接导致，可以参考下面的步骤排查：
+* 从 :code:`train.log` ， :code:`server.log` 找到最早报错的地方，查看是否是其他错误引发的报错（比如FPE，内存不足，磁盘空间不足等）。
+* 如果发现最早的报错就是网络通信的问题，很有可能是非独占方式执行导致的端口冲突，可以联系OP，看当前MPI集群是否支持resource=full参数提交，如果支持增加此参数提交，并更换job 端口。
+* 如果当前MPI集群并不支持任务独占模式，可以联系OP是否可以更换集群或升级当前集群。
+19. PaddlePaddle如何输出多个层
+------------------------------
+* 将需要输出的层作为 :code:`paddle.inference.Inference()` 接口的 :code:`output_layer` 参数输入，代码如下：
+..  code-block:: python
+    inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters)
+* 指定要输出的字段进行输出。以输出 :code:`value` 字段为例，代码如下：
+..  code-block:: python
+    out = inferer.infer(input=data_batch, flatten_result=False, field=["value"])
+这里设置 :code:`flatten_result=False`，得到的输出结果是元素个数等于输出字段数的 :code:`list`，该 :code:`list` 的每个元素是由所有输出层相应字段结果组成的 :code:`list`，每个字段结果的类型是 :code:`numpy.array`。:code:`flatten_result` 的默认值为 :code:`True`，该情况下，PaddlePaddle会分别对每个字段将所有输出层的结果按行进行拼接，如果各输出层该字段 :code:`numpy.array` 结果的相应维数不匹配，程序将不能正常运行。
+20. :code:`paddle.layer.memory` 的参数 :code:`name` 如何使用
+-------------------------------------------------------------
+* :code:`paddle.layer.memory` 用于获取特定layer上一时间步的输出，该layer是通过参数 :code:`name` 指定，即，:code:`paddle.layer.memory` 会关联参数 :code:`name` 取值相同的layer，并将该layer上一时间步的输出作为自身当前时间步的输出。
+* PaddlePaddle的所有layer都有唯一的name，用户通过参数 :code:`name` 设定，当用户没有显式设定时，PaddlePaddle会自动设定。而 :code:`paddle.layer.memory` 不是真正的layer，其name由参数 :code:`memory_name` 设定，当用户没有显式设定时，PaddlePaddle会自动设定。:code:`paddle.layer.memory` 的参数 :code:`name` 用于指定其要关联的layer，需要用户显式设定。
+21. dropout 使用
+-----------------
+* 在PaddlePaddle中使用dropout有两种方式
+  * 在相应layer的 :code:`layer_attr` 设置 :code:`drop_rate`，以 :code:`paddle.layer.fc` 为例，代码如下：
+  ..  code-block:: python
+      fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5))
+  * 使用 :code:`paddle.layer.dropout`，以 :code:`paddle.layer.fc` 为例，代码如下：
+  ..  code-block:: python
+      fc = paddle.layer.fc(input=input)
+      drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5)
+* :code:`paddle.layer.dropout` 实际上使用了 :code:`paddle.layer.add_to`，并在该layer里采用第一种方式设置 :code:`drop_rate` 来使用dropout的。这种方式对内存消耗较大。
+* PaddlePaddle在激活函数里实现dropout，而不是在layer里实现。
+* :code:`paddle.layer.lstmemory`、:code:`paddle.layer.grumemory`、:code:`paddle.layer.recurrent` 不是通过一般的方式来实现对输出的激活，所以不能采用第一种方式在这几个layer里设置 :code:`drop_rate` 来使用dropout。若要对这几个layer使用dropout，可采用第二种方式，即使用 :code:`paddle.layer.dropout`。
+22. 如何设置学习率退火（learning rate annealing）
+------------------------------------------------
+在相应的优化算法里设置learning_rate_schedule及相关参数，以使用Adam算法为例，代码如下：
+..  code-block:: python
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=1e-3,
+        learning_rate_decay_a=0.5,
+        learning_rate_decay_b=0.75,
+        learning_rate_schedule="poly",)
+PaddlePaddle目前支持8种learning_rate_schedule，这8种learning_rate_schedule及其对应学习率计算方式如下：
+* "constant"
+  lr = learning_rate
+* "poly"
+  lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b)
+  其中，num_samples_processed为已训练样本数，下同。
+* "caffe_poly"
+  lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b)
+* "exp"
+  lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b)
+* "discexp"
+  lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b))
+* "linear"
+  lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b)
+* "manual"
+  这是一种按已训练样本数分段取值的学习率退火方法。使用该learning_rate_schedule时，用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数，当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例，代码如下：
+  ..  code-block:: python
+      optimizer = paddle.optimizer.Adam(
+          learning_rate=1e-3,
+          learning_rate_schedule="manual",
+          learning_rate_args="1000:1.0,2000:0.9,3000:0.8",)
+  在该示例中，当已训练样本数小于等于1000时，学习率为 :code:`1e-3 * 1.0`；当已训练样本数大于1000小于等于2000时，学习率为 :code:`1e-3 * 0.9`；当已训练样本数大于2000时，学习率为 :code:`1e-3 * 0.8`。
+* "pass_manual"
+  这是一种按已训练pass数分段取值的学习率退火方法。使用该learning_rate_schedule时，用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数，当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例，代码如下：
+  ..  code-block:: python
+      optimizer = paddle.optimizer.Adam(
+          learning_rate=1e-3,
+          learning_rate_schedule="pass_manual",
+          learning_rate_args="1:1.0,2:0.9,3:0.8",) 
+  在该示例中，当已训练pass数小于等于1时，学习率为 :code:`1e-3 * 1.0`；当已训练pass数大于1小于等于2时，学习率为 :code:`1e-3 * 0.9`；当已训练pass数大于2时，学习率为 :code:`1e-3 * 0.8`。
+23. 出现 :code:`Duplicated layer name` 错误怎么办
+--------------------------------------------------
+出现该错误的原因一般是用户对不同layer的参数 :code:`name` 设置了相同的取值。遇到该错误时，先找出参数 :code:`name` 取值相同的layer，然后将这些layer的参数 :code:`name` 设置为不同的值。
--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
+# How to write a new operator
+ - [Background](#Background)
+ - [Implementing C++ Types](#Implementing_C++_Types)
+   - [Defining ProtoMaker](#Defining_ProtoMaker)
+   - [Defining Operator](#Defining_Operator)
+   - [Registering Operator](#Registering_Operator)
+   - [Compilation](#Compilation)
+ - [Python Binding](#Python_Binding)
+ - [Unit Tests](#Unit_Tests)
+## Background
+Here are the base types needed. For details, please refer to the design docs.
+- `framework::OperatorBase`: Operator (Op) base class.
+- `framework::OpKernel`: Base class for Op computation.
+- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation.
+- `class OpProtoAndCheckerMaker`: Describes an Operator's input, output, attributes and description, mainly used to interface with Python API.
+An operator can be differentiated by whether it has kernel methods. An operator with a kernel inherits from `OperatorWithKernel`, while one without inherits from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information:
+ Information           | Where is it defined
+--------------  | :----------------------
+OpProtoMaker definition  | `.cc` files. A backward Op does not need an OpProtoMaker interface.
+Op definition           | `.cc` files
+Kernel implementation       | The kernel methods shared between CPU and GPU are defined in `.h` files. CPU-specific kernels live in `.cc` files, while GPU-specific kernels are implemented in `.cu` files.
+Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the GPU implementation.
+New Operator implementations are added to the directory [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable). **The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
+Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
+## Implementing C++ Types
+### 1. Defining Class ProtoMaker
+Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and one output.
+First, define `ProtoMaker` to describe the Operator's input, output, and additional comments:
+```cpp
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor), 2D tensor of size (M x K)");
+    AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
+    AddOutput("Out", "(Tensor), 2D tensor of size (M x N)");
+    AddComment(R"DOC(
+Two Element Mul Operator.
+The equation is: Out = X * Y
+)DOC");
+  }
+};
+```
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43) is inherited from `framework::OpProtoAndCheckerMaker`; its constructor takes 2 parameters:
+   - `framework::OpProto` stores Operator input and variable attribute, used for generating Python API interfaces.
+   - `framework::OpAttrChecker` is used to validate variable attributes.
+The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information will be added to `OpProto`.
+The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md).
+An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37) is implemented as follows:
+```cpp
+template <typename AttrType>
+class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensor of scale operator.").NotInGradient();
+    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
+    AddComment(R"DOC(Scale operator
+The equation is: Out = scale*X
+)DOC");
+    AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
+  }
+};
+```
+There are two changes in this example:
+- `AddInput("X","...").NotInGradient()` expresses that input `X` is not involved in the computation of `ScaleOp`'s corresponding gradient Op. If an input to an operator does not participate in back-propagation, please explicitly set `.NotInGradient()`.
+- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` adds the `scale` constant as an attribute, and sets its default value to 1.0.
+### 2. Defining Operator
+The following code defines the interface for MulOp:
+```cpp
+class MulOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto dim0 = ctx.Input<Tensor>("X")->dims();
+    auto dim1 = ctx.Input<Tensor>("Y")->dims();
+    PADDLE_ENFORCE_EQ(dim0.size(), 2,
+                      "input X(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("X"));
+    PADDLE_ENFORCE_EQ(dim1.size(), 2,
+                      "input Y(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("Y"));
+    PADDLE_ENFORCE_EQ(
+        dim0[1], dim1[0],
+        "First matrix's width must be equal with second matrix's height.");
+    ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
+  }
+};
+```
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22) is inherited from `OperatorWithKernel`. Its `public` member
+```cpp
+using framework::OperatorWithKernel::OperatorWithKernel;
+```
+expresses an operator constructor using base class `OperatorWithKernel`, alternatively written as
+```cpp
+MulOp(const std::string &type, const framework::VariableNameMap &inputs,
+      const framework::VariableNameMap &outputs,
+      const framework::AttributeMap &attrs)
+  : OperatorWithKernel(type, inputs, outputs, attrs) {}
+```
+The `InferShape` interface needs to be overridden. `InferShape` is a const method and cannot modify the Op's member variables; its parameter `const framework::InferShapeContext &ctx` can be used to extract inputs, outputs, and attributes. Its purpose is to
+  - 1). validate and error out early: it checks input data dimensions and types.
+  - 2). configures the tensor shape in the output.
+Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later.
+### 3. Defining OpKernel
+`MulKernel` inherits `framework::OpKernel`, which includes the following templates:
+- `typename  Place` denotes device type. When different devices, namely the CPU and the GPU, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+- `typename T` denotes data type, such as `float` or `double`.
+`MulKernel` types need to override the `Compute` interface.
+- `Compute` takes one input variable `const framework::ExecutionContext& context`.
+- Compared with `InferShapeContext`, `ExecutionContext` includes device types, and can similarly extract input, output, and attribute variables.
+- `Compute` implements the computation logics of an `OpKernel`.
+`MulKernel`'s implementation of `Compute` is as follows:
+  ```cpp
+  template <typename Place, typename T>
+  class MulKernel : public framework::OpKernel {
+  public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<Tensor>("X");
+    auto* Y = context.Input<Tensor>("Y");
+    auto* Z = context.Output<Tensor>("Out");
+    Z->mutable_data<T>(context.GetPlace());
+    auto* device_context =
+        const_cast<platform::DeviceContext*>(context.device_context_);
+    math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+  }
+  };
+  ```
+Note that **different devices (CPU, GPU) share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**
+`MulOp`'s CPU and GPU share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+To ease the writing of `OpKernel` compute, and for reusing code cross-device, `Eigen unsupported Tensor` module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).
+This concludes the forward implementation of an operator. Next its operation and kernel need to be registered in a `.cc` file.
+The definition of its corresponding backward operator, if applicable, is similar to that of a forward operator. **Note that a backward operator does not include a `ProtoMaker`**.
+### 4. Registering Operator
+- In `.cc` files, register forward and backward operator classes and the CPU kernel.
+    ```cpp
+    namespace ops = paddle::operators;
+    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
+    REGISTER_OP_CPU_KERNEL(mul_grad,
+                  ops::MulGradKernel<paddle::platform::CPUPlace, float>);
+    ```
+   In that code block,
+    - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
+    - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
+    - `REGISTER_OP_CPU_KERNEL` registers the `ops::MulKernel` class with the specialized template types `paddle::platform::CPUPlace` and `float`; the second call registers `ops::MulGradKernel` in the same way.
+- Registering GPU Kernel in `.cu` files
+    - Note that if GPU Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as
+    ```cpp
+    // if use Eigen unsupported module before include head files
+    #define EIGEN_USE_GPU
+    namespace ops = paddle::operators;
+    REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
+    REGISTER_OP_GPU_KERNEL(mul_grad,
+                           ops::MulGradKernel<paddle::platform::GPUPlace, float>);
+    ```
+### 5. Compilation
+Run the following commands to compile.
+```
+make mul_op
+```
+## Python Binding
+The system will automatically bind to Python and link it to a generated library.
+## Unit Tests
+Unit tests include comparing a forward operator's implementations on different devices, comparing a backward operator's implementation on different devices, and a scaling test for the backward operator. Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py).
--- a/doc/survey/cluster_bootstrapping_tools.md
+++ b/doc/survey/cluster_bootstrapping_tools.md
+# Cluster bootstrapping tool survey
+## Abstract
+In order to bring up a cluster from bare metal machines to a fully functional Kubernetes cluster for PaddlePaddle to run on, we need to utilize some tools. Here we compare [Sextant](https://github.com/k8sp/sextant) and [Tectonic installer](https://github.com/coreos/tectonic-installer).
+## Basic assumptions
+Here are some basic assumptions before we move on to  details
+1. You are an administrator of a bare metal machine cluster, which means:
+  * you have full control to each of the machines.
+  * you have full control to the network which machines are connected to.
+2. Machines can be booted from network with PXE or iPXE
+3. You understand the [general procedure to bring up a cluster](#appendix-general-procedure-to-bring-up-a-cluster)
+If you can check off all of the items above for your cluster, then keep reading.
+## Comparing Sextant and Tectonic installer
+### Sextant
+Sextant is an end2end solution to bring up a bare metal cluster to a fully functional k8s cluster; it integrates DHCP, name service, PXE, cloud-config-service, and docker registry services altogether.
+#### Pros
+1. End2End: basically all admin need to do is to config the cluster.yaml and power on the cluster.
+2. Offline cluster configuration: working with Sextant has 2 phases, config time and deploy time. At config time, the admin's machine needs internet connectivity in order to download some images, etc. At deploy time, it's completely OK to go offline since all dependencies are ready after config time.
+3. docker registry integrated.
+4. GPU machines are taken care of.
+#### Cons
+1. k8s API server is not deployed with high availability in mind by default.
+2. No grouping support.
+3. No API interface, a one-off service.
+### Tectonic installer
+First of all, Tectonic is not free, it requires coreos.com account as a step of installation, and free user can only create less than 10 nodes.
+Tectonic is a suite of software which wraps around k8s and provides more utilities for DevOps.
+Tectonic installer as it's named, it installs Tectonic to a bare metal cluster which means it's not totally an equivalent of Sextant. At the "booting a cluster" part, it mostly utilizes [Matchbox](https://github.com/coreos/matchbox), which is a general cluster bootstrapper.
+Matchbox's approach is similar to Sextant's.
+### Pros
+1. supports grouping machines.
+2. supports running provisioning service in rkt. (not a big deal though).
+3. supports http/gRPC API interface.
+4. supports multi-template.
+### Cons
+1. Not an e2e solution to bring up a cluster, need a lot of extra work and other software.
+2. [Not fully supporting](https://github.com/coreos/matchbox/issues/550) centOS deployment yet.
+## Conclusion
+Sextant is a better solution overall for paddle cloud deploying to a bare metal cluster. It would be great if Sextant can also 1) deploy k8s api server with high availability by default; 2) not designed as a one-off service.
+## Appendix: General procedure to bring up a cluster
+It's physically impossible for a cluster admin to manually install OS and applications into cluster nodes one by one, here is what an admin would do in cloud industry:
+1. setup a bootstrap machine with static IP in the cluster, which has following services:
+  * DHCP: assigns ip address for rest of the nodes.
+  * name service: to map node name to an IP
+  * PXE related services: the booting related info will be delivered to newly booted machines as their IP is assigned via DHCP service, PXE service will provide further booting and installing info and image with TFTP and http protocol. 
+  * cluster config service: this is for providing cluster node with OS config via http
+  * optional docker registry: a built-in docker registry makes the whole cluster independent from connecting internet, and speeds up software distribution.
+2. New node powers on, it will
+  * broadcast the request for an IP address
+  * DHCP server assigns the IP address, and deliver the PXE booting related info to the node.
+  * cluster node will request config files with booting info delivered with DHCP via the TFTP service, and in most of the cases, the config file will point to a http service for the booting image.
+  * Since PXE is configured with initrd, it will utilize the cloud config service and do further installations like coreOS or K8s installations.
+  * then restart the node.
+For further understanding, following 2 links from Matchbox are some good readings:
+* [Machine lifecycle](https://github.com/coreos/matchbox/blob/master/Documentation/machine-lifecycle.md)
+* [PXE booting](https://github.com/coreos/matchbox/blob/master/Documentation/network-booting.md)
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -62,6 +62,7 @@ if(ANDROID)
          LIBRARY DESTINATION lib/${ANDROID_ABI})
  execute_process(
    COMMAND ${GIT_EXECUTABLE} log --pretty=oneline -1
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
    OUTPUT_VARIABLE GIT_COMMITS_LIST
    RESULT_VARIABLE GIT_COMMITS_LIST_RESULT
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
@@ -81,8 +82,7 @@ if(ANDROID)
      )"
  )
 else(ANDROID)
-  install(TARGETS paddle_capi_whole
+  install(TARGETS paddle_capi_whole ARCHIVE DESTINATION lib)
-          ARCHIVE DESTINATION lib)
  if(NOT IOS)
    install(TARGETS paddle_capi_shared DESTINATION lib)
  endif()

--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
@@ -19,74 +19,59 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-template <>
+static ProgramDesc* g_program_desc = nullptr;
-AttrType AttrTypeID<int>() {
-  return INT;
+ProgramDesc& GetProgramDesc() {
-}
+  if (g_program_desc == nullptr) {
-template <>
+    g_program_desc = new ProgramDesc();
-AttrType AttrTypeID<float>() {
+  }
-  return FLOAT;
+  return *g_program_desc;
-}
-template <>
-AttrType AttrTypeID<std::string>() {
-  return STRING;
-}
-template <>
-AttrType AttrTypeID<std::vector<int>>() {
-  return INTS;
-}
-template <>
-AttrType AttrTypeID<std::vector<float>>() {
-  return FLOATS;
-}
-template <>
-AttrType AttrTypeID<std::vector<std::string>>() {
-  return STRINGS;
-}
-template <>
-AttrType AttrTypeID<std::vector<std::pair<int, int>>>() {
-  return INT_PAIRS;
 }
 Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
  switch (attr_desc.type()) {
-    case paddle::framework::AttrType::INT: {
+    case framework::AttrType::BOOLEAN: {
+      return attr_desc.b();
+    }
+    case framework::AttrType::INT: {
      return attr_desc.i();
    }
-    case paddle::framework::AttrType::FLOAT: {
+    case framework::AttrType::FLOAT: {
      return attr_desc.f();
    }
-    case paddle::framework::AttrType::STRING: {
+    case framework::AttrType::STRING: {
      return attr_desc.s();
    }
-    case paddle::framework::AttrType::INTS: {
+    case framework::AttrType::BOOLEANS: {
+      std::vector<bool> val(attr_desc.bools_size());
+      for (int i = 0; i < attr_desc.bools_size(); ++i) {
+        val[i] = attr_desc.bools(i);
+      }
+      return val;
+    }
+    case framework::AttrType::INTS: {
      std::vector<int> val(attr_desc.ints_size());
      for (int i = 0; i < attr_desc.ints_size(); ++i) {
        val[i] = attr_desc.ints(i);
      }
      return val;
    }
-    case paddle::framework::AttrType::FLOATS: {
+    case framework::AttrType::FLOATS: {
      std::vector<float> val(attr_desc.floats_size());
      for (int i = 0; i < attr_desc.floats_size(); ++i) {
        val[i] = attr_desc.floats(i);
      }
      return val;
    }
-    case paddle::framework::AttrType::STRINGS: {
+    case framework::AttrType::STRINGS: {
      std::vector<std::string> val(attr_desc.strings_size());
      for (int i = 0; i < attr_desc.strings_size(); ++i) {
        val[i] = attr_desc.strings(i);
      }
      return val;
    }
-    case paddle::framework::AttrType::INT_PAIRS: {
+    case framework::AttrType::BLOCK: {
-      std::vector<std::pair<int, int>> val(attr_desc.int_pairs_size());
+      return GetProgramDesc().mutable_blocks(attr_desc.block_idx());
-      for (int i = 0; i < attr_desc.int_pairs_size(); ++i) {
-        val[i].first = attr_desc.int_pairs(i).first();
-        val[i].second = attr_desc.int_pairs(i).second();
-      }
-      return val;
    }
  }
  PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !");

--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -27,15 +27,21 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
+// The type order must match the AttrType enum in framework.proto.
 typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
-                       std::vector<float>, std::vector<std::string>,
+                       std::vector<float>, std::vector<std::string>, bool,
-                       std::vector<std::pair<int, int>>>
+                       std::vector<bool>, BlockDesc*>
    Attribute;
 typedef std::unordered_map<std::string, Attribute> AttributeMap;
+ProgramDesc& GetProgramDesc();
 template <typename T>
-AttrType AttrTypeID();
+inline AttrType AttrTypeID() {
+  Attribute tmp = T();
+  return static_cast<AttrType>(tmp.which() - 1);
+}
 Attribute GetAttrValue(const OpDesc::Attr& attr_desc);

--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -166,9 +166,8 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
        // If part of input gradient of that operator is not calculated, fill
        // zero variables to that input gradient.
-        net->AppendOp(OpRegistry::CreateOp("fill_zeros_like",
+        net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}},
-                                           {{"Src", {prefix}}},
+                                           {{"Y", {grad_input}}}, {}));
-                                           {{"Dst", {grad_input}}}, {}));
      }
      return false;
    });

--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -127,8 +127,8 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker {
 public:
  FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Src", "x");
+    AddInput("X", "x");
-    AddOutput("Dst", "out");
+    AddOutput("Y", "out");
    AddComment("");
  }
 };
@@ -325,10 +325,10 @@ TEST(Backward, op_part_of_output_are_not_need) {
  auto &fill_zero = *net->ops_[0];
  ASSERT_EQ("fill_zeros_like", fill_zero.Type());
-  ASSERT_EQ(1UL, fill_zero.Inputs("Src").size());
+  ASSERT_EQ(1UL, fill_zero.Inputs("X").size());
-  ASSERT_EQ("Z", fill_zero.Input("Src"));
+  ASSERT_EQ("Z", fill_zero.Input("X"));
-  ASSERT_EQ(1UL, fill_zero.Outputs("Dst").size());
+  ASSERT_EQ(1UL, fill_zero.Outputs("Y").size());
-  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Dst"));
+  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Y"));
  auto &d_many_out = *net->ops_[1];
  ASSERT_EQ("many_output_op_grad", d_many_out.Type());

--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -22,14 +22,11 @@ enum AttrType {
  INTS = 3;
  FLOATS = 4;
  STRINGS = 5;
-  INT_PAIRS = 6;
+  BOOLEAN = 6;
+  BOOLEANS = 7;
+  BLOCK = 8;
 }
-message IntPair {
-  required int32 first = 1;
-  required int32 second = 2;
-};
 // OpDesc describes an instance of a C++ framework::OperatorBase
 // derived class type.
 message OpDesc {
@@ -43,7 +40,9 @@ message OpDesc {
    repeated int32 ints = 6;
    repeated float floats = 7;
    repeated string strings = 8;
-    repeated IntPair int_pairs = 9;
+    optional bool b = 10;
+    repeated bool bools = 11;
+    optional int32 block_idx = 12;
  };
  message Var {
@@ -100,7 +99,7 @@ enum DataType {
 message LoDTensorDesc {
  required DataType data_type = 1;
-  repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
  optional int32 lod_level = 3 [ default = 0 ];
 }
@@ -108,3 +107,12 @@ message VarDesc {
  required string name = 1;
  optional LoDTensorDesc lod_tensor = 2;
 }
+message BlockDesc {
+  required int32 idx = 1;
+  required int32 parent_idx = 2;
+  repeated VarDesc vars = 3;
+  repeated OpDesc ops = 4;
+}
+message ProgramDesc { repeated BlockDesc blocks = 1; }
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -72,20 +72,16 @@ bool operator==(const LoD& a, const LoD& b) {
  return true;
 }
-void LoDTensor::SliceLevels(size_t level_begin, size_t level_end) {
+void LoDTensor::ShrinkLevels(size_t level_begin, size_t level_end) {
  auto new_lod = framework::SliceLevels(lod_, level_begin, level_end);
  lod_ = new_lod;
 }
-void LoDTensor::SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) {
+void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
-  PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
+                              size_t elem_end) {
-                 NumLevels());
+  PADDLE_ENFORCE_LT(level, NumLevels());
-  PADDLE_ENFORCE(elem_begin < NumElements(level),
+  PADDLE_ENFORCE_LT(elem_begin, NumElements(level));
-                 "element begin [%d] out of range [%d]", elem_begin,
+  PADDLE_ENFORCE_LT(elem_end, NumElements(level) + 1);
-                 NumElements(level));
-  PADDLE_ENFORCE(elem_end < NumElements(level) + 1,
-                 "element end [%d] out of range [%d]", elem_end,
-                 NumElements(level));
  auto new_lod = framework::SliceInLevel(lod_, level, elem_begin, elem_end);
  lod_ = new_lod;

--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -89,15 +89,15 @@ class LoDTensor : public Tensor {
  }
  /*
-   * Slice of levels[level_begin:level_end]
+   * Shrink levels[level_begin:level_end]
   */
-  void SliceLevels(size_t level_begin, size_t level_end);
+  void ShrinkLevels(size_t level_begin, size_t level_end);
  /*
-   * Slice of elements of a level, [elem_begin: elem_end]
+   * Shrink elements of a level, [elem_begin: elem_end]
   * @note: low performance in slice lod_.
   */
-  void SliceInLevel(size_t level, size_t elem_begin, size_t elem_end);
+  void ShrinkInLevel(size_t level, size_t elem_begin, size_t elem_end);
 private:
  LoD lod_;

--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -56,11 +56,11 @@ TEST_F(LoDTensorTester, NumElements) {
  ASSERT_EQ(lod_tensor_.NumElements(2), 8UL);
 }
-TEST_F(LoDTensorTester, SliceLevels) {
+TEST_F(LoDTensorTester, ShrinkLevels) {
  // slice 1 level
  for (size_t level = 0; level < 3UL; ++level) {
    LoDTensor new_lod_tensor = lod_tensor_;
-    new_lod_tensor.SliceLevels(level, level + 1);
+    new_lod_tensor.ShrinkLevels(level, level + 1);
    ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
    ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
@@ -68,7 +68,7 @@ TEST_F(LoDTensorTester, SliceLevels) {
  // slice 2 level
  for (size_t level = 0; level < 2UL; ++level) {
    LoDTensor new_lod_tensor = lod_tensor_;
-    new_lod_tensor.SliceLevels(level, level + 2);
+    new_lod_tensor.ShrinkLevels(level, level + 2);
    ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
    ASSERT_EQ(new_lod_tensor.NumElements(1),
@@ -77,10 +77,10 @@ TEST_F(LoDTensorTester, SliceLevels) {
  }
 }
-TEST_F(LoDTensorTester, SliceInLevel) {
+TEST_F(LoDTensorTester, ShrinkInLevel) {
  size_t level = 0;
  LoDTensor new_lod_tensor = lod_tensor_;
-  new_lod_tensor.SliceInLevel(level, 0, 2);
+  new_lod_tensor.ShrinkInLevel(level, 0, 2);
  EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL);
  EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL);
  EXPECT_EQ(new_lod_tensor.NumElements(1), 4UL);
@@ -89,7 +89,7 @@ TEST_F(LoDTensorTester, SliceInLevel) {
  level = 1;
  new_lod_tensor = lod_tensor_;
-  new_lod_tensor.SliceInLevel(level, 0, 2);
+  new_lod_tensor.ShrinkInLevel(level, 0, 2);
  ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -60,8 +60,8 @@ std::string OperatorBase::Output(const std::string& name) const {
 const std::vector<std::string>& OperatorBase::Outputs(
    const std::string& name) const {
  auto it = outputs_.find(name);
-  PADDLE_ENFORCE(it != outputs_.end(), "Op %s does not have output %s", type_,
+  PADDLE_ENFORCE(it != outputs_.end(), "Op %s does not have output called %s",
-                 name);
+                 type_, name);
  return it->second;
 }
@@ -207,23 +207,22 @@ const std::vector<const Tensor*> InferShapeContext::MultiInput<Tensor>(
 }
 template <>
-Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
+Tensor* InferShapeContext::Output<Tensor>(const std::string& name) const {
-  auto* var = OutputVar(name);
+  auto var = OutputVar(name);
-  return var == nullptr ? nullptr : const_cast<Tensor*>(GetTensorFromVar(var));
+  return var == nullptr ? nullptr : var->GetMutable<LoDTensor>();
 }
 template <>
-std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
+std::vector<Tensor*> InferShapeContext::MultiOutput<Tensor>(
    const std::string& name) const {
  auto names = op().Outputs(name);
  std::vector<Tensor*> res;
  res.reserve(names.size());
  std::transform(names.begin(), names.end(), std::back_inserter(res),
                 [&](const std::string& sub_name) {
-                   auto var = scope().FindVar(sub_name);
+                   auto var = scope_.FindVar(sub_name);
-                   return var == nullptr
+                   return var == nullptr ? nullptr
-                              ? nullptr
+                                         : var->GetMutable<LoDTensor>();
-                              : const_cast<Tensor*>(GetTensorFromVar(var));
                 });
  return res;
 }

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -212,9 +212,9 @@ class InferShapeContext {
    return res;
  }
-  std::vector<const Variable*> MultiOutputVar(const std::string& name) const {
+  std::vector<Variable*> MultiOutputVar(const std::string& name) const {
    auto names = op_.Outputs(name);
-    std::vector<const Variable*> res;
+    std::vector<Variable*> res;
    res.reserve(names.size());
    std::transform(names.begin(), names.end(), std::back_inserter(res),
                   [this](const std::string& name) {
@@ -271,6 +271,20 @@ class InferShapeContext {
    return &var->Get<Tensor>();
  }
+  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
+                size_t j = 0) const {
+    PADDLE_ENFORCE_LT(i, InputSize(in));
+    PADDLE_ENFORCE_LT(j, OutputSize(out));
+    auto* in_var = MultiInputVar(in)[i];
+    auto* out_var = MultiOutputVar(out)[j];
+    if (!in_var->IsType<LoDTensor>()) return;
+    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
+                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
+    auto in_tensor = in_var->Get<LoDTensor>();
+    auto* out_tensor = out_var->GetMutable<LoDTensor>();
+    out_tensor->set_lod(in_tensor.lod());
+  }
 private:
  const OperatorBase& op_;
  const Scope& scope_;
@@ -283,6 +297,13 @@ template <>
 const std::vector<const Tensor*> InferShapeContext::MultiInput<Tensor>(
    const std::string& name) const;
+template <>
+Tensor* InferShapeContext::Output<Tensor>(const std::string& name) const;
+template <>
+std::vector<Tensor*> InferShapeContext::MultiOutput<Tensor>(
+    const std::string& name) const;
 template <typename T>
 struct EigenDeviceConverter;
@@ -315,38 +336,10 @@ class ExecutionContext : public InferShapeContext {
    return device_context_;
  }
-  // redefine Output function,
-  // use Variable::Get instead of Variable::GetMutable
-  template <typename T>
-  T* Output(const std::string& name) const {
-    auto var = OutputVar(name);
-    return var == nullptr ? nullptr : const_cast<T*>(&var->Get<T>());
-  }
-  // redefine MultiOutput function.
-  // use Variable::Get instead of Variable::GetMutable
-  template <typename T>
-  std::vector<T*> MultiOutput(const std::string& name) const {
-    auto names = op().Outputs(name);
-    std::vector<T*> res;
-    res.reserve(names.size());
-    std::transform(
-        names.begin(), names.end(), std::back_inserter(res),
-        [&](const std::string& sub_name) { return Output<T>(sub_name); });
-    return res;
-  }
 private:
  const platform::DeviceContext& device_context_;
 };
-template <>
-Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
-template <>
-std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
-    const std::string& name) const;
 class OpKernel {
 public:
  /**

--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -29,16 +29,19 @@ limitations under the License. */
 namespace paddle {
-namespace framework {
+namespace pybind {
 namespace details {
 template <bool less, size_t i, typename... args>
 struct CastToPyBufferImpl;
 }
+}  // namespace pybind
+namespace framework {
 class Tensor {
 public:
  template <bool less, size_t i, typename... args>
-  friend struct details::CastToPyBufferImpl;
+  friend struct pybind::details::CastToPyBufferImpl;
  template <typename T, size_t D, int MajorType, typename IndexType>
  friend struct EigenTensor;
@@ -165,12 +168,6 @@ class Tensor {
  /*! points to dimensions of memory block. */
  DDim dims_;
-  /**
-   * A cache of the number of elements in a tensor.
-   * Would be 0 for an uninitialized tensor.
-   */
-  int64_t numel_;
  /**
   * @brief   A PlaceHolder may be shared by more than one tensor.
   *

--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -147,13 +147,12 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
 inline Tensor& Tensor::Resize(const DDim& dims) {
  dims_ = dims;
-  numel_ = product(dims_);
  return *this;
 }
 inline const DDim& Tensor::dims() const { return dims_; }
-inline int64_t Tensor::numel() const { return numel_; }
+inline int64_t Tensor::numel() const { return product(dims_); }
 template <typename T>
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {

--- a/paddle/gserver/activations/MKLDNNActivation.h
+++ b/paddle/gserver/activations/MKLDNNActivation.h
@@ -100,6 +100,7 @@ public:
    if (cnt_ == act.value->getElementCnt()) {
      return;
    }
+    VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
    cnt_ = act.value->getElementCnt();
    stream_.reset(new MKLDNNStream());
    auto eng = CPUEngine::Instance().getEngine();
@@ -110,7 +111,6 @@ public:
    float alpha = getAlpha();
    float beta = getBeta();
-    /// forward
    pipelineFwd_.clear();
    val_ = std::dynamic_pointer_cast<MKLDNNMatrix>(act.value);
    if (val_ == nullptr) {
@@ -152,6 +152,7 @@ public:
    if (!needResetBwd_) {
      return;
    }
+    VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
    needResetBwd_ = false;
    mkldnn::algorithm algo = getAlgo(this->getName());
    float alpha = getBwdAlpha();

--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -64,7 +64,7 @@ bool MKLDNNConvLayer::init(const LayerMap& layerMap,
  // create biases
  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_));
+    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
  }
  return true;
 }
@@ -251,22 +251,31 @@ void MKLDNNConvLayer::resetInValue(
  // create buffer and reorder if input value do not match
  cpuInVal_ = nullptr;
  cvtInVal_ = nullptr;
-  if (inputIsOnlyMKLDNN()) {
-    MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
+  MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
-    CHECK(dnnIn) << "Input should be MKLDNNMatrix";
+  CHECK_EQ(inputIsOnlyMKLDNN(), dnnIn != nullptr);
-    if (dnnIn->getPrimitiveDesc() != in->getPrimitiveDesc()) {
+  if (dnnIn != nullptr && dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) {
-      CHECK_EQ(dnnIn->getFormat(), format::nc);
+    in = dnnIn;
+    return;
+  }
+  if (dnnIn) {
+    if (dnnIn->getFormat() == format::nc) {
      CHECK(ih_ == 1 && iw_ == 1) << "when input is nc format";
      // create a new one with nchw format and same data
      memory::dims inDims = memory::dims{bs_, ic_, 1, 1};
      dnnIn = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_);
-      CHECK(dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc());
    }
-    in = dnnIn;
+    if (dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) {
+      in = dnnIn;
+      return;
+    }
+    cpuInVal_ = dnnIn;
+    in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc());
+    cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in);
+    CHECK(cvtInVal_) << "should not be emptry";
  } else {
-    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
    memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
-    cpuInVal_ = MKLDNNMatrix::create(cpuIn, inDims, format::nchw, engine_);
+    cpuInVal_ = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_);
    if (cpuInVal_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
      // create new mkldnn matrix
      in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc());
@@ -535,7 +544,7 @@ void MKLDNNConvLayer::resetWgtValBwdData(
  } else {
    wgtValBwdData_ = wgtVal_;
  }
-  VLOG(MKLDNN_FMTS) << "weight value format for backward data"
+  VLOG(MKLDNN_FMTS) << "weight value format for backward data: "
                    << wgtValBwdData_->getFormat();
 }

--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -49,7 +49,7 @@ bool MKLDNNFcLayer::init(const LayerMap& layerMap,
  // create biases
  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_));
+    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
  }
  return true;
 }
@@ -161,9 +161,16 @@ void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) {
 void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
                                      MKLDNNMatrixPtr& bias) {
+  format wgtFmt = format::oihw;
+  if (inVal_->getFormat() == format::nChw8c) {
+    wgtFmt = format::oIhw8i;
+  } else if (inVal_->getFormat() == format::nChw16c) {
+    wgtFmt = format::oIhw16i;
+  }
  wgt = MKLDNNMatrix::create(
-      weight_->getW(), {oc_, ic_, ih_, iw_}, format::oihw, engine_);
+      weight_->getW(), {oc_, ic_, ih_, iw_}, wgtFmt, engine_);
  wgt->downSpatial();
+  VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
  bias = (biases_ && biases_->getW())
             ? MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_)

--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -115,6 +115,7 @@ public:
      copySeqInfoToOutputs();
      size_t elemenCnt = inputLayers_[0]->getOutput().value->getElementCnt();
      if (inputElemenCnt_ != elemenCnt) {
+        VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
        // reset when input total sizes changed, not only the batchsize
        inputElemenCnt_ = elemenCnt;
        reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
@@ -142,6 +143,7 @@ public:
  void backward(const UpdateCallback& callback) override {
    if (needResetBwd_) {
+      VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
      resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
      needResetBwd_ = false;
    }

--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -55,6 +55,13 @@ function(op_library TARGET)
        set(pybind_flag 1)
    endif()
+    # activation_op contains several operators
+    if ("${TARGET}" STREQUAL "activation_op")
+        set(pybind_flag 1)
+        # It's enough to add just one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(sigmoid);\n")
+    endif()
    # pybind USE_NO_KERNEL_OP
    file(READ ${TARGET}.cc TARGET_CONTENT)
    string(REGEX MATCH "OperatorWithKernel" regex_result "${TARGET_CONTENT}")

--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -39,7 +39,8 @@ class AccuracyOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(inference->dims()[0], label->dims()[0],
                      "inference size must be the same as label size");
-    ctx.Output<framework::LoDTensor>("Accuracy")->Resize({1});
+    ctx.Output<framework::Tensor>("Accuracy")->Resize({1});
+    ctx.ShareLoD("Inference", /*->*/ "Accuracy");
  }
 };
@@ -54,11 +55,15 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
    // TODO(typhoonzero): AddInput("Weight", ...
    AddOutput("Accuracy", "The accuracy of current batch");
-    AddComment(
+    AddComment(R"DOC(
-        R"DOC(Accuracy. It will print accuracy rate for classification.
+Accuracy. It will print accuracy rate for classification.
 The accuracy is:
 ..  math::
-accuracy = \\frac{NumOfCorrectPredicts}{NumOfAllSamples})DOC");
+accuracy = \\frac{NumOfCorrectPredicts}{NumOfAllSamples})
+Both the input `Inference` and `Label` can carry the LoD (Level of Details)
+information, or not. But the output only shares the LoD with input `Inference`.
+)DOC");
  }
 };

--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/operators/activation_op.h"
+namespace paddle {
+namespace operators {
+// Forward operator shared by the element-wise activations in this file.
+// Shape inference gives output "Y" the dimensions of input "X" and lets
+// "Y" share the LoD of "X".
+class ActivationOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto *y = ctx.Output<framework::Tensor>("Y");
+    const auto *x = ctx.Input<framework::Tensor>("X");
+    y->Resize(x->dims());
+    ctx.ShareLoD("X", /*->*/ "Y");
+  }
+};
+// Backward operator shared by the element-wise activations in this file.
+// Shape inference sizes the gradient of "X" to the dimensions of the
+// forward output "Y".
+class ActivationOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto *x_grad =
+        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    const auto *y = ctx.Input<framework::Tensor>("Y");
+    x_grad->Resize(y->dims());
+  }
+};
+// Proto maker for the sigmoid operator: declares input "X", output "Y" and
+// the operator description. It adds no attributes.
+class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SigmoidOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Sigmoid operator");
+    AddOutput("Y", "Output of Sigmoid operator");
+    AddComment("Sigmoid activation operator, sigmoid = 1 / (1 + exp(-x))");
+  }
+};
+// Proto maker for the exp operator: declares input "X", output "Y" and the
+// operator description. It adds no attributes.
+class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ExpOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Exp operator");
+    AddOutput("Y", "Output of Exp operator");
+    AddComment("Exp activation operator, exp(x) = e^x");
+  }
+};
+// Proto maker for the relu operator: declares input "X", output "Y" and the
+// operator description. It adds no attributes.
+class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Relu operator");
+    AddOutput("Y", "Output of Relu operator");
+    AddComment("Relu activation operator, relu(x) = max(x, 0)");
+  }
+};
+// Proto maker for the tanh operator: declares input "X", output "Y" and the
+// operator description. It adds no attributes.
+class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  TanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Tanh operator");
+    AddOutput("Y", "Output of Tanh operator");
+    AddComment(
+        "Tanh activation operator, tanh = (exp(x) - exp(-x)) / (exp(x) + "
+        "exp(-x))");
+  }
+};
+// Proto maker for the sqrt operator: declares input "X", output "Y" and the
+// operator description. It adds no attributes.
+class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SqrtOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Sqrt operator");
+    AddOutput("Y", "Output of Sqrt operator");
+    AddComment("Sqrt activation operator, sqrt(x) = x^(1/2)");
+  }
+};
+// Proto maker for the abs operator: declares input "X", output "Y" and the
+// operator description. It adds no attributes.
+class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AbsOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Abs operator");
+    AddOutput("Y", "Output of Abs operator");
+    AddComment("Abs activation operator, abs(x) = |x|");
+  }
+};
+// Proto maker for the reciprocal operator: declares input "X", output "Y"
+// and the operator description. It adds no attributes.
+class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReciprocalOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Reciprocal operator");
+    AddOutput("Y", "Output of Reciprocal operator");
+    AddComment("Reciprocal activation operator, reciprocal(x) = 1 / x");
+  }
+};
+// Proto maker for the log operator: declares input "X", output "Y" and the
+// operator description. It adds no attributes.
+class LogOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LogOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Log operator");
+    AddOutput("Y", "Output of Log operator");
+    AddComment("Log activation operator, log(x) = natural logarithm of x");
+  }
+};
+// Proto maker for the square operator: declares input "X", output "Y" and
+// the operator description. It adds no attributes.
+class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SquareOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Square operator");
+    AddOutput("Y", "Output of Square operator");
+    AddComment("Square activation operator, square(x) = x^2");
+  }
+};
+// Proto maker for the BRelu (bounded relu) operator: declares input "X",
+// output "Y" and the attributes "t_min" (default 0) and "t_max"
+// (default 24), both of type AttrType.
+template <typename AttrType>
+class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of BRelu operator");
+    AddOutput("Y", "Output of BRelu operator");
+    // Documented as a clamp of x into [t_min, t_max]. The previous wording,
+    // max(min(x, t_min), t_max), evaluates to t_max for every x whenever
+    // t_min <= t_max (as with the defaults 0 and 24).
+    AddComment("BRelu activation operator, brelu = min(max(x, t_min), t_max)");
+    AddAttr<AttrType>("t_min", "The min marginal value of BRelu")
+        .SetDefault(static_cast<AttrType>(0));
+    AddAttr<AttrType>("t_max", "The max marginal value of BRelu")
+        .SetDefault(static_cast<AttrType>(24));
+  }
+};
+// Proto maker for the SoftRelu operator: declares input "X", output "Y" and
+// the attribute "threshold" (default 40) of type AttrType.
+template <typename AttrType>
+class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SoftReluOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of SoftRelu operator");
+    AddOutput("Y", "Output of SoftRelu operator");
+    // x is clipped into [-threshold, threshold] before the softplus. The
+    // previous wording, max(min(x, threshold), threshold), is the constant
+    // `threshold` for every x.
+    // NOTE(review): confirm the lower bound against the SoftRelu functor.
+    AddComment(
+        "SoftRelu activation operator, soft_relu = log(1 + exp(max(min(x, "
+        "threshold), -threshold)))");
+    AddAttr<AttrType>("threshold", "The threshold value of SoftRelu")
+        .SetDefault(static_cast<AttrType>(40));
+  }
+};
+// Proto maker for the pow operator: declares input "X", output "Y" and the
+// attribute "factor" (default 1) of type AttrType.
+template <typename AttrType>
+class PowOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  PowOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Pow operator");
+    AddOutput("Y", "Output of Pow operator");
+    AddComment("Pow activation operator, pow(x, factor) = x^factor");
+    AddAttr<AttrType>("factor", "The exponential factor of Pow")
+        .SetDefault(static_cast<AttrType>(1));
+  }
+};
+// Proto maker for the STanh (scaled tanh) operator: declares input "X",
+// output "Y" and the attributes "scale_a" (default 2/3) and "scale_b"
+// (default 1.7159), both of type AttrType.
+template <typename AttrType>
+class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  STanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of STanh operator");
+    AddOutput("Y", "Output of STanh operator");
+    AddComment("STanh activation operator, stanh = b * tanh(a * x)");
+    // Use floating-point literals: `2 / 3` is integer division and yields 0,
+    // which would make the default stanh output 0 for every input.
+    AddAttr<AttrType>("scale_a", "The scale parameter of a for the input")
+        .SetDefault(static_cast<AttrType>(2.0 / 3.0));
+    AddAttr<AttrType>("scale_b", "The scale parameter of b for the input")
+        .SetDefault(static_cast<AttrType>(1.7159));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+// CPU registrations for the functor-based activation operators.
+// Each REGISTER_OP call names the forward op, its operator class and proto
+// maker, and the matching *_grad op with its operator class. The float CPU
+// kernels are the generic ActivationKernel / ActivationGradKernel templates
+// instantiated with the per-operator functor.
+namespace ops = paddle::operators;
+REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad,
+            ops::ActivationOpGrad);
+REGISTER_OP_CPU_KERNEL(sigmoid,
+                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
+                                             ops::SigmoidFunctor<float>>);
+REGISTER_OP_CPU_KERNEL(
+    sigmoid_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
+                                            ops::SigmoidGradFunctor<float>>);
+REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
+            ops::ActivationOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    exp,
+    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::ExpFunctor>);
+REGISTER_OP_CPU_KERNEL(exp_grad,
+                       ops::ActivationGradKernel<paddle::platform::CPUPlace,
+                                                 float, ops::ExpGradFunctor>);
+REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad,
+            ops::ActivationOpGrad);
+REGISTER_OP_CPU_KERNEL(relu,
+                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
+                                             ops::ReluFunctor<float>>);
+REGISTER_OP_CPU_KERNEL(
+    relu_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
+                                         ops::ReluGradFunctor<float>>);
+REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad,
+            ops::ActivationOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    tanh,
+    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::TanhFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    tanh_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
+                                         ops::TanhGradFunctor<float>>);
+REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
+            ops::ActivationOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    sqrt,
+    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::SqrtFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    sqrt_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
+                                         ops::SqrtGradFunctor<float>>);
+REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad,
+            ops::ActivationOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    abs,
+    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::AbsFunctor>);
+REGISTER_OP_CPU_KERNEL(abs_grad,
+                       ops::ActivationGradKernel<paddle::platform::CPUPlace,
+                                                 float, ops::AbsGradFunctor>);
+REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker,
+            reciprocal_grad, ops::ActivationOpGrad);
+REGISTER_OP_CPU_KERNEL(reciprocal,
+                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
+                                             ops::ReciprocalFunctor<float>>);
+REGISTER_OP_CPU_KERNEL(
+    reciprocal_grad,
+    ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
+                              ops::ReciprocalGradFunctor<float>>);
+REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad,
+            ops::ActivationOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    log,
+    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::LogFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    log_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
+                                        ops::LogGradFunctor<float>>);
+REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad,
+            ops::ActivationOpGrad);
+REGISTER_OP_CPU_KERNEL(square,
+                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
+                                             ops::SquareFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    square_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
+                                           ops::SquareGradFunctor<float>>);
+// CPU registrations for the attribute-driven activations (brelu, soft_relu,
+// pow, stanh). Their proto makers are instantiated with float attributes, and
+// each op uses its own forward/backward kernel class instead of the generic
+// functor-based ActivationKernel.
+REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker<float>, brelu_grad,
+            ops::ActivationOpGrad);
+REGISTER_OP_CPU_KERNEL(brelu,
+                       ops::BReluKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(brelu_grad,
+                       ops::BReluGradKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker<float>,
+            soft_relu_grad, ops::ActivationOpGrad);
+REGISTER_OP_CPU_KERNEL(soft_relu,
+                       ops::SoftReluKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    soft_relu_grad, ops::SoftReluGradKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker<float>, pow_grad,
+            ops::ActivationOpGrad);
+REGISTER_OP_CPU_KERNEL(pow, ops::PowKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(pow_grad,
+                       ops::PowGradKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker<float>, stanh_grad,
+            ops::ActivationOpGrad);
+REGISTER_OP_CPU_KERNEL(stanh,
+                       ops::STanhKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(stanh_grad,
+                       ops::STanhGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/activation_op.cu
+++ b/paddle/operators/activation_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#define EIGEN_USE_GPU
+#include "paddle/operators/activation_op.h"
+// GPU kernel registrations for the functor-based activation operators.
+// Only kernels are registered here; each forward/backward kernel is the
+// generic ActivationKernel / ActivationGradKernel template instantiated for
+// GPUPlace, float, and the per-operator functor.
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(sigmoid,
+                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
+                                             ops::SigmoidFunctor<float>>);
+REGISTER_OP_GPU_KERNEL(
+    sigmoid_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
+                                            ops::SigmoidGradFunctor<float>>);
+REGISTER_OP_GPU_KERNEL(
+    exp,
+    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::ExpFunctor>);
+REGISTER_OP_GPU_KERNEL(exp_grad,
+                       ops::ActivationGradKernel<paddle::platform::GPUPlace,
+                                                 float, ops::ExpGradFunctor>);
+REGISTER_OP_GPU_KERNEL(relu,
+                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
+                                             ops::ReluFunctor<float>>);
+REGISTER_OP_GPU_KERNEL(
+    relu_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
+                                         ops::ReluGradFunctor<float>>);
+REGISTER_OP_GPU_KERNEL(
+    tanh,
+    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::TanhFunctor>);
+REGISTER_OP_GPU_KERNEL(
+    tanh_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
+                                         ops::TanhGradFunctor<float>>);
+REGISTER_OP_GPU_KERNEL(
+    sqrt,
+    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::SqrtFunctor>);
+REGISTER_OP_GPU_KERNEL(
+    sqrt_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
+                                         ops::SqrtGradFunctor<float>>);
+REGISTER_OP_GPU_KERNEL(
+    abs,
+    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::AbsFunctor>);
+REGISTER_OP_GPU_KERNEL(abs_grad,
+                       ops::ActivationGradKernel<paddle::platform::GPUPlace,
+                                                 float, ops::AbsGradFunctor>);
+REGISTER_OP_GPU_KERNEL(reciprocal,
+                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
+                                             ops::ReciprocalFunctor<float>>);
+REGISTER_OP_GPU_KERNEL(
+    reciprocal_grad,
+    ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
+                              ops::ReciprocalGradFunctor<float>>);
+REGISTER_OP_GPU_KERNEL(
+    log,
+    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::LogFunctor>);
+REGISTER_OP_GPU_KERNEL(
+    log_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
+                                        ops::LogGradFunctor<float>>);
+REGISTER_OP_GPU_KERNEL(square,
+                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
+                                             ops::SquareFunctor>);
+REGISTER_OP_GPU_KERNEL(
+    square_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
+                                           ops::SquareGradFunctor<float>>);
+// GPU kernel registrations for the attribute-driven activations (brelu,
+// soft_relu, pow, stanh); each uses its own kernel class instantiated for
+// GPUPlace and float.
+REGISTER_OP_GPU_KERNEL(brelu,
+                       ops::BReluKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(brelu_grad,
+                       ops::BReluGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(soft_relu,
+                       ops::SoftReluKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    soft_relu_grad, ops::SoftReluGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(pow, ops::PowKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(pow_grad,
+                       ops::PowGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(stanh,
+                       ops::STanhKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(stanh_grad,
+                       ops::STanhGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+// Generic forward kernel for element-wise activations.
+// Looks up input "X" and output "Y", allocates Y as type T on the execution
+// place, and hands flattened Eigen views of both to a default-constructed
+// Functor together with the Eigen device for Place.
+template <typename Place, typename T, typename Functor>
+class ActivationKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* in_tensor = context.Input<framework::Tensor>("X");
+    auto* out_tensor = context.Output<framework::Tensor>("Y");
+    // Allocate the output before it is wrapped in an Eigen view.
+    out_tensor->mutable_data<T>(context.GetPlace());
+    auto in_vec = framework::EigenVector<T>::Flatten(*in_tensor);
+    auto out_vec = framework::EigenVector<T>::Flatten(*out_tensor);
+    auto device = context.GetEigenDevice<Place>();
+    Functor activation;
+    activation(device, in_vec, out_vec);
+  }
+};
+// Generic backward kernel for element-wise activations.
+// Looks up X, Y and the gradient of Y, allocates the gradient of X as type T,
+// and lets a default-constructed Functor fill it from the flattened views.
+template <typename Place, typename T, typename Functor>
+class ActivationGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* in_tensor = context.Input<framework::Tensor>("X");
+    const auto* out_tensor = context.Input<framework::Tensor>("Y");
+    const auto* out_grad_tensor =
+        context.Input<framework::Tensor>(framework::GradVarName("Y"));
+    auto* in_grad_tensor =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    // Allocate dX before it is wrapped in an Eigen view.
+    in_grad_tensor->mutable_data<T>(context.GetPlace());
+    auto out_grad = framework::EigenVector<T>::Flatten(*out_grad_tensor);
+    auto in_vec = framework::EigenVector<T>::Flatten(*in_tensor);
+    auto out_vec = framework::EigenVector<T>::Flatten(*out_tensor);
+    auto in_grad = framework::EigenVector<T>::Flatten(*in_grad_tensor);
+    auto device = context.GetEigenDevice<Place>();
+    Functor grad_fn;
+    grad_fn(device, in_vec, out_vec, out_grad, in_grad);
+  }
+};
+// Logistic sigmoid, element-wise: y = 1 / (1 + exp(-x)).
+template <typename T>
+struct SigmoidFunctor {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) {
+    const T one = static_cast<T>(1);
+    y.device(d) = one / (one + (-x).exp());
+  }
+};
+// Gradient of sigmoid expressed through the forward output:
+// dx = dy * y * (1 - y). The input x is not needed.
+template <typename T>
+struct SigmoidGradFunctor {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+    const T one = static_cast<T>(1);
+    dx.device(d) = dy * y * (one - y);
+  }
+};
+// exp(x) = e^x
+// Forward functor: writes the element-wise exponential of x into y, evaluated
+// on Eigen device d.
+struct ExpFunctor {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) {
+    y.device(d) = x.exp();
+  }
+};
+// Gradient of exp: since d(e^x)/dx = e^x = y, dx = dy * y.
+// x is unused; the five-argument signature is the one ActivationGradKernel
+// calls for every gradient functor.
+struct ExpGradFunctor {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+    dx.device(d) = dy * y;
+  }
+};
+// Rectified linear unit, element-wise: y = max(x, 0).
+template <typename T>
+struct ReluFunctor {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) {
+    const T zero = static_cast<T>(0);
+    y.device(d) = x.cwiseMax(zero);
+  }
+};
+// Gradient of relu: dy passes through where x > 0 and is zeroed elsewhere
+// (including x == 0).
+template <typename T>
+struct ReluGradFunctor {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+    const T zero = static_cast<T>(0);
+    dx.device(d) = dy * (x > zero).template cast<T>();
+  }
+};
+// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
+// Forward functor: writes the element-wise hyperbolic tangent of x into y,
+// evaluated on Eigen device d.
+struct TanhFunctor {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) {
+    y.device(d) = x.tanh();
+  }
+};
+// Gradient of tanh expressed through the forward output:
+// dx = dy * (1 - y^2). The input x is not needed.
+template <typename T>
+struct TanhGradFunctor {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+    const T one = static_cast<T>(1);
+    dx.device(d) = dy * (one - y * y);
+  }
+};
+// sqrt(x) = x^(1/2)
+// Forward functor: writes the element-wise square root of x into y, evaluated
+// on Eigen device d.
+struct SqrtFunctor {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) {
+    y.device(d) = x.sqrt();
+  }
+};
+// Gradient of sqrt expressed through the forward output y = sqrt(x):
+// dx = 0.5 * dy / conj(y). The input x is not used.
+template <typename T>
+struct SqrtGradFunctor {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+    // NOTE(review): conj is presumably an identity for real-valued data and
+    // only matters for complex types -- confirm how Eigen::numext::conj
+    // behaves when handed the whole expression type Y.
+    const Y y_conj = Eigen::numext::conj(y);
+    dx.device(d) = static_cast<T>(0.5) * dy / y_conj;
+  }
+};
+// abs(x) = |x|
+// Forward functor: writes the element-wise absolute value of x into y,
+// evaluated on Eigen device d.
+struct AbsFunctor {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) {
+    y.device(d) = x.abs();
+  }
+};
+// Gradient of abs: dx = dy * sign(x). The forward output y is not used.
+struct AbsGradFunctor {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+    dx.device(d) = dy * x.sign();
+  }
+};
+// Element-wise reciprocal: y = 1 / x.
+template <typename T>
+struct ReciprocalFunctor {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) {
+    const T one = static_cast<T>(1);
+    y.device(d) = one / x;
+  }
+};
+// Gradient of reciprocal expressed through the forward output y = 1 / x:
+// dx = -dy * y^2. The input x is not needed.
+template <typename T>
+struct ReciprocalGradFunctor {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+    const T minus_one = static_cast<T>(-1);
+    dx.device(d) = dy * minus_one * y * y;
+  }
+};
+// log(x) = natural logarithm of x
+// Forward functor: writes the element-wise natural logarithm of x into y,
+// evaluated on Eigen device d.
+struct LogFunctor {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) {
+    y.device(d) = x.log();
+  }
+};
+// Gradient of the natural logarithm: dx = dy * (1 / x).
+// The forward output y is not needed.
+template <typename T>
+struct LogGradFunctor {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+    const T one = static_cast<T>(1);
+    dx.device(d) = dy * (one / x);
+  }
+};
+// square(x) = x^2
+// Forward functor: writes the element-wise square of x into y, evaluated on
+// Eigen device d.
+struct SquareFunctor {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) {
+    y.device(d) = x.square();
+  }
+};
+// Gradient of square: dx = dy * 2 * x. The forward output y is not needed.
+template <typename T>
+struct SquareGradFunctor {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+    const T two = static_cast<T>(2);
+    dx.device(d) = dy * two * x;
+  }
+};
+// Forward kernel for bounded ReLU: y = min(max(x, t_min), t_max).
+// The bounds are read from the "t_min" / "t_max" attributes (stored as
+// AttrType) and converted to the data type T.
+template <typename Place, typename T, typename AttrType = T>
+class BReluKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* input = context.Input<framework::Tensor>("X");
+    auto* output = context.Output<framework::Tensor>("Y");
+    const T lower = static_cast<T>(context.Attr<AttrType>("t_min"));
+    const T upper = static_cast<T>(context.Attr<AttrType>("t_max"));
+    // Allocate the output before it is wrapped in an Eigen view.
+    output->mutable_data<T>(context.GetPlace());
+    auto in_vec = framework::EigenVector<T>::Flatten(*input);
+    auto out_vec = framework::EigenVector<T>::Flatten(*output);
+    auto device = context.GetEigenDevice<Place>();
+    out_vec.device(device) = in_vec.cwiseMax(lower).cwiseMin(upper);
+  }
+};
+// Backward kernel for bounded ReLU: dy passes through only where x lies
+// strictly between t_min and t_max; everywhere else dx is zero.
+template <typename Place, typename T, typename AttrType = T>
+class BReluGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* input = context.Input<framework::Tensor>("X");
+    const auto* out_grad_tensor =
+        context.Input<framework::Tensor>(framework::GradVarName("Y"));
+    auto* in_grad_tensor =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    const T lower = static_cast<T>(context.Attr<AttrType>("t_min"));
+    const T upper = static_cast<T>(context.Attr<AttrType>("t_max"));
+    // Allocate dX before it is wrapped in an Eigen view.
+    in_grad_tensor->mutable_data<T>(context.GetPlace());
+    auto out_grad = framework::EigenVector<T>::Flatten(*out_grad_tensor);
+    auto in_vec = framework::EigenVector<T>::Flatten(*input);
+    auto in_grad = framework::EigenVector<T>::Flatten(*in_grad_tensor);
+    auto device = context.GetEigenDevice<Place>();
+    // The product of the two comparisons acts as a logical AND mask.
+    in_grad.device(device) =
+        out_grad * ((in_vec > lower) * (in_vec < upper)).template cast<T>();
+  }
+};
+// Forward kernel for SoftRelu: y = log(1 + exp(clip(x))), where clip clamps
+// x to [-threshold, threshold] using the "threshold" attribute.
+template <typename Place, typename T, typename AttrType = T>
+class SoftReluKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* input = context.Input<framework::Tensor>("X");
+    auto* output = context.Output<framework::Tensor>("Y");
+    const T limit = static_cast<T>(context.Attr<AttrType>("threshold"));
+    // Allocate the output before it is wrapped in an Eigen view.
+    output->mutable_data<T>(context.GetPlace());
+    auto in_vec = framework::EigenVector<T>::Flatten(*input);
+    auto out_vec = framework::EigenVector<T>::Flatten(*output);
+    auto device = context.GetEigenDevice<Place>();
+    // The clamped input is materialised with eval() before the exponential.
+    auto clipped = in_vec.cwiseMax(-limit).cwiseMin(limit).eval();
+    out_vec.device(device) = (static_cast<T>(1) + clipped.exp()).log();
+  }
+};
+// Backward kernel for SoftRelu: dx = dy * (1 - exp(-y)) where x lies strictly
+// inside (-threshold, threshold), and zero outside that range.
+template <typename Place, typename T, typename AttrType = T>
+class SoftReluGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* input = context.Input<framework::Tensor>("X");
+    const auto* output = context.Input<framework::Tensor>("Y");
+    const auto* out_grad_tensor =
+        context.Input<framework::Tensor>(framework::GradVarName("Y"));
+    auto* in_grad_tensor =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    const T limit = static_cast<T>(context.Attr<AttrType>("threshold"));
+    // Allocate dX before it is wrapped in an Eigen view.
+    in_grad_tensor->mutable_data<T>(context.GetPlace());
+    auto in_vec = framework::EigenVector<T>::Flatten(*input);
+    auto out_vec = framework::EigenVector<T>::Flatten(*output);
+    auto out_grad = framework::EigenVector<T>::Flatten(*out_grad_tensor);
+    auto in_grad = framework::EigenVector<T>::Flatten(*in_grad_tensor);
+    auto device = context.GetEigenDevice<Place>();
+    // 1 where the forward pass did not clamp x, 0 where it did.
+    auto unclipped_mask =
+        ((in_vec > -limit) * (in_vec < limit)).template cast<T>().eval();
+    in_grad.device(device) =
+        out_grad * (static_cast<T>(1) - (-out_vec).exp()) * unclipped_mask;
+  }
+};
+// Forward kernel for Pow: y = x^factor, element-wise, with the exponent read
+// from the "factor" attribute.
+template <typename Place, typename T, typename AttrType = T>
+class PowKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* input = context.Input<framework::Tensor>("X");
+    auto* output = context.Output<framework::Tensor>("Y");
+    const T exponent = static_cast<T>(context.Attr<AttrType>("factor"));
+    // Allocate the output before it is wrapped in an Eigen view.
+    output->mutable_data<T>(context.GetPlace());
+    auto in_vec = framework::EigenVector<T>::Flatten(*input);
+    auto out_vec = framework::EigenVector<T>::Flatten(*output);
+    auto device = context.GetEigenDevice<Place>();
+    out_vec.device(device) = in_vec.pow(exponent);
+  }
+};
+// Backward kernel for Pow: dx = dy * factor * x^(factor - 1).
+template <typename Place, typename T, typename AttrType = T>
+class PowGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* input = context.Input<framework::Tensor>("X");
+    const auto* out_grad_tensor =
+        context.Input<framework::Tensor>(framework::GradVarName("Y"));
+    auto* in_grad_tensor =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    const T exponent = static_cast<T>(context.Attr<AttrType>("factor"));
+    // Allocate dX before it is wrapped in an Eigen view.
+    in_grad_tensor->mutable_data<T>(context.GetPlace());
+    auto out_grad = framework::EigenVector<T>::Flatten(*out_grad_tensor);
+    auto in_vec = framework::EigenVector<T>::Flatten(*input);
+    auto in_grad = framework::EigenVector<T>::Flatten(*in_grad_tensor);
+    auto device = context.GetEigenDevice<Place>();
+    const T reduced_exponent = exponent - static_cast<T>(1);
+    in_grad.device(device) = out_grad * exponent * in_vec.pow(reduced_exponent);
+  }
+};
+// Forward kernel for scaled tanh: y = scale_b * tanh(scale_a * x), with both
+// scales read from the op attributes.
+template <typename Place, typename T, typename AttrType = T>
+class STanhKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* input = context.Input<framework::Tensor>("X");
+    auto* output = context.Output<framework::Tensor>("Y");
+    const T inner_scale = static_cast<T>(context.Attr<AttrType>("scale_a"));
+    const T outer_scale = static_cast<T>(context.Attr<AttrType>("scale_b"));
+    // Allocate the output before it is wrapped in an Eigen view.
+    output->mutable_data<T>(context.GetPlace());
+    auto in_vec = framework::EigenVector<T>::Flatten(*input);
+    auto out_vec = framework::EigenVector<T>::Flatten(*output);
+    auto device = context.GetEigenDevice<Place>();
+    out_vec.device(device) = outer_scale * (inner_scale * in_vec).tanh();
+  }
+};
+// Backward kernel for scaled tanh:
+// dx = dy * scale_a * scale_b * (1 - tanh(scale_a * x)^2).
+template <typename Place, typename T, typename AttrType = T>
+class STanhGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* input = context.Input<framework::Tensor>("X");
+    const auto* out_grad_tensor =
+        context.Input<framework::Tensor>(framework::GradVarName("Y"));
+    auto* in_grad_tensor =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    const T inner_scale = static_cast<T>(context.Attr<AttrType>("scale_a"));
+    const T outer_scale = static_cast<T>(context.Attr<AttrType>("scale_b"));
+    // Allocate dX before it is wrapped in an Eigen view.
+    in_grad_tensor->mutable_data<T>(context.GetPlace());
+    auto out_grad = framework::EigenVector<T>::Flatten(*out_grad_tensor);
+    auto in_vec = framework::EigenVector<T>::Flatten(*input);
+    auto in_grad = framework::EigenVector<T>::Flatten(*in_grad_tensor);
+    auto device = context.GetEigenDevice<Place>();
+    // tanh(scale_a * x)^2, written as an explicit product of the two factors.
+    auto tanh_squared =
+        (inner_scale * in_vec).tanh() * (inner_scale * in_vec).tanh();
+    in_grad.device(device) = out_grad * inner_scale * outer_scale *
+                             (static_cast<T>(1) - tanh_squared);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -33,7 +33,7 @@ class AddOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
                      ctx.Input<Tensor>("Y")->dims(),
                      "Two input of Add Op's dimension must be same.");
-    ctx.Output<framework::LoDTensor>("Out")->Resize(
+    ctx.Output<framework::Tensor>("Out")->Resize(
        ctx.Input<Tensor>("X")->dims());
  }
 };

--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -17,8 +17,6 @@
 namespace paddle {
 namespace operators {
-using framework::LoDTensor;
 class ClipOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -29,11 +27,12 @@ class ClipOp : public framework::OperatorWithKernel {
                            "Input(X) of ClipOp should not be null.");
    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
                            "Output(Out) of ClipOp should not be null.");
-    auto x_dims = ctx.Input<LoDTensor>("X")->dims();
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
    auto max = Attr<float>("max");
    auto min = Attr<float>("min");
    PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
-    ctx.Output<LoDTensor>("Out")->Resize(x_dims);
+    ctx.Output<Tensor>("Out")->Resize(x_dims);
+    ctx.ShareLoD("X", /*->*/ "Out");
  }
 };
@@ -66,8 +65,8 @@ class ClipOpGrad : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
                            "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx.Input<LoDTensor>("X")->dims();
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto *x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
    if (x_grad != nullptr) {
      x_grad->Resize(x_dims);
    }

--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
@@ -29,7 +29,7 @@ class ConcatOp : public framework::OperatorWithKernel {
                            "Output(Out) of ConcatOp should not be null.");
    auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto *out = ctx.Output<framework::LoDTensor>("Out");
+    auto *out = ctx.Output<framework::Tensor>("Out");
    size_t axis = static_cast<size_t>(ctx.Attr<int>("axis"));
    size_t n = ins.size();

--- a/paddle/operators/conv2d_op.cc
+++ b/paddle/operators/conv2d_op.cc
@@ -37,7 +37,7 @@ class Conv2DOp : public framework::OperatorWithKernel {
    auto in = ctx.Input<Tensor>("Input");
    auto filter = ctx.Input<Tensor>("Filter");
-    auto out = ctx.Output<framework::LoDTensor>("Output");
+    auto out = ctx.Output<framework::Tensor>("Output");
    std::vector<int> strides = Attr<std::vector<int>>("strides");
    std::vector<int> paddings = Attr<std::vector<int>>("paddings");
    int groups = Attr<int>("groups");
@@ -111,10 +111,9 @@ class Conv2DOpGrad : public framework::OperatorWithKernel {
  void InferShape(const framework::InferShapeContext &ctx) const override {
    auto in = ctx.Input<Tensor>("Input");
    auto filter = ctx.Input<Tensor>("Filter");
-    auto d_in =
+    auto d_in = ctx.Output<framework::Tensor>(framework::GradVarName("Input"));
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Input"));
    auto d_filter =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Filter"));
+        ctx.Output<framework::Tensor>(framework::GradVarName("Filter"));
    if (d_in) d_in->Resize(in->dims());
    if (d_filter) d_filter->Resize(filter->dims());
  }

--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
@@ -54,9 +54,10 @@ class CosSimOp : public framework::OperatorWithKernel {
                   " just 1 (which will be broadcasted to match Input(X)).");
    // resize tensor
-    ctx.Output<framework::LoDTensor>("Out")->Resize({x_dims[0], 1});
+    ctx.Output<framework::Tensor>("Out")->Resize({x_dims[0], 1});
-    ctx.Output<framework::LoDTensor>("XNorm")->Resize({x_dims[0], 1});
+    ctx.Output<framework::Tensor>("XNorm")->Resize({x_dims[0], 1});
-    ctx.Output<framework::LoDTensor>("YNorm")->Resize({y_dims[0], 1});
+    ctx.Output<framework::Tensor>("YNorm")->Resize({y_dims[0], 1});
+    ctx.ShareLoD("X", /*->*/ "Out");
  }
 };
@@ -81,10 +82,13 @@ Cosine Similarity Operator.
 The equation is: Out = X^T * Y / (sqrt(X^T * X) * sqrt(Y^T * Y)).
-Input(X) and Input(Y) must have the same shape, except that the 1st dimension
+The input `X` and `Y` must have the same shape, except that the 1st dimension
-of Input(Y) could be just 1 (different from Input(X)), which will be
+of input `Y` could be just 1 (different from input `X`), which will be
-broadcasted to match the shape of Input(X) before computing their cosine
+broadcasted to match the shape of input `X` before computing their cosine
 similarity.
+Both the input `X` and `Y` can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD with input `X`.
 )DOC");
  }
 };
@@ -139,10 +143,8 @@ class CosSimOpGrad : public framework::OperatorWithKernel {
                      "Shape of Input(Out@Grad) must be [X.Dim(0), 1].");
    // resize tensor
-    auto *x_grad =
+    auto *x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto *y_grad = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
-    auto *y_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
    if (x_grad) x_grad->Resize(x_dims);
    if (y_grad) y_grad->Resize(y_dims);
  }

--- a/paddle/operators/crop_op.cc
+++ b/paddle/operators/crop_op.cc
@@ -19,7 +19,6 @@ namespace paddle {
 namespace operators {
 using framework::Tensor;
-using framework::LoDTensor;
 class CropOp : public framework::OperatorWithKernel {
 public:
@@ -31,9 +30,9 @@ class CropOp : public framework::OperatorWithKernel {
                            "Input(X) of CropOp should not be null.");
    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
                            "Output(Out) of CropOp should not be null.");
-    auto x_dim = ctx.Input<LoDTensor>("X")->dims();
+    auto x_dim = ctx.Input<Tensor>("X")->dims();
-    auto *y = ctx.Input<LoDTensor>("Y");
+    auto *y = ctx.Input<Tensor>("Y");
-    auto *out = ctx.Output<LoDTensor>("Out");
+    auto *out = ctx.Output<Tensor>("Out");
    if (y == nullptr) {
      auto shape = Attr<std::vector<int>>("shape");
      PADDLE_ENFORCE_EQ(
@@ -121,8 +120,8 @@ class CropOpGrad : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
                            "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx.Input<LoDTensor>("X")->dims();
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto *x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
    if (x_grad != nullptr) {
      x_grad->Resize(x_dims);
    }

--- a/paddle/operators/crop_op.h
+++ b/paddle/operators/crop_op.h
@@ -38,10 +38,10 @@ class CropKernel : public framework::OpKernel {
    auto out_stride = framework::stride(out->dims());
    auto offsets = context.Attr<std::vector<int>>("offsets");
    PADDLE_ENFORCE_EQ(
-        x->dims().size(), offsets.size(),
+        x->dims().size(), static_cast<int64_t>(offsets.size()),
        "Offsets size should be equal to dimension size of input tensor.");
    int64_t offset = 0;
-    for (int i = 0; i < offsets.size(); ++i) {
+    for (size_t i = 0; i < offsets.size(); ++i) {
      offset += (x_stride[i] * offsets[i]);
    }
    StridedMemcpy<T>(context.device_context(), x_data + offset, x_stride,
@@ -57,7 +57,7 @@ void CropGradFunction(const framework::ExecutionContext& context) {
    d_x->mutable_data<T>(context.GetPlace());
    auto offsets = context.Attr<std::vector<int>>("offsets");
    Eigen::array<std::pair<int, int>, D> paddings;
-    for (int i = 0; i < D; ++i) {
+    for (size_t i = 0; i < D; ++i) {
      paddings[i].first = offsets[i];
      paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i];
    }

--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -17,8 +17,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using framework::LoDTensor;
 class CrossEntropyOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -35,23 +33,21 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2.");
    PADDLE_ENFORCE_EQ(label->dims().size(), 2,
                      "Input(Label)'s rank must be 2.");
-    // TODO(xinghai-sun): remove this check after swtiching to bool
-    PADDLE_ENFORCE(ctx.Attr<int>("soft_label") == 0 ||
-                   ctx.Attr<int>("soft_label") == 1);
    PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
                      "The 1st dimension of Input(X) and Input(Label) must "
                      "be equal.");
-    if (ctx.Attr<int>("soft_label") == 1) {
+    if (ctx.Attr<bool>("soft_label")) {
      PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
-                        "If Attr(soft_label) == 1, The 2nd dimension of "
+                        "If Attr(soft_label) == true, The 2nd dimension of "
                        "Input(X) and Input(Label) must be equal.");
    } else {
      PADDLE_ENFORCE_EQ(label->dims()[1], 1,
-                        "If Attr(soft_label) == 0, The 2nd dimension of "
+                        "If Attr(soft_label) == false, The 2nd dimension of "
                        "Input(Label) must be 1.");
    }
-    ctx.Output<LoDTensor>("Y")->Resize({x->dims()[0], 1});
+    ctx.Output<Tensor>("Y")->Resize({x->dims()[0], 1});
+    ctx.ShareLoD("X", /*->*/ "Y");
  }
 };
@@ -74,9 +70,6 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(dy->dims().size(), 2, "Input(Y@Grad)'s rank must be 2.");
    PADDLE_ENFORCE_EQ(label->dims().size(), 2,
                      "Input(Label)'s rank must be 2.");
-    // TODO(xinghai-sun): remove this check after swtiching to bool
-    PADDLE_ENFORCE(ctx.Attr<int>("soft_label") == 0 ||
-                   ctx.Attr<int>("soft_label") == 1);
    PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
                      "The 1st dimension of Input(X) and Input(Label) must "
                      "be equal.");
@@ -85,17 +78,17 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
                      "be equal.");
    PADDLE_ENFORCE_EQ(dy->dims()[1], 1,
                      "The 2nd dimension of Input(Y@Grad) must be 1.");
-    if (ctx.Attr<int>("soft_label") == 1) {
+    if (ctx.Attr<bool>("soft_label")) {
      PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
-                        "If Attr(soft_label) == 1, The 2nd dimension of "
+                        "If Attr(soft_label) == true, The 2nd dimension of "
                        "Input(X) and Input(Label) must be equal.");
    } else {
      PADDLE_ENFORCE_EQ(label->dims()[1], 1,
-                        "If Attr(soft_label) == 0, The 2nd dimension of "
+                        "If Attr(soft_label) == false, The 2nd dimension of "
                        "Input(Label) must be 1.");
    }
-    auto dx = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    dx->Resize(x->dims());
  }
 };
@@ -108,7 +101,8 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "The first input of CrossEntropyOp");
    AddInput("Label", "The second input of CrossEntropyOp");
    AddOutput("Y", "The output of CrossEntropyOp");
-    AddAttr<int>("soft_label", "Is soft label. Default zero.").SetDefault(0);
+    AddAttr<bool>("soft_label", "Is soft label. Default false.")
+        .SetDefault(false);
    AddComment(R"DOC(
 CrossEntropy Operator.
@@ -116,12 +110,12 @@ CrossEntropy Operator.
 It supports both standard cross-entropy and soft-label cross-entropy loss
 computation.
 1) One-hot cross-entropy:
-    soft_label = 0, Label[i, 0] indicates the class index for sample i:
+    soft_label = False, Label[i, 0] indicates the class index for sample i:
                Y[i] = -log(X[i, Label[i]])
 2) Soft-label cross-entropy:
-    soft_label = 1, Label[i, j] indicates the soft label of class j
+    soft_label = True, Label[i, j] indicates the soft label of class j
    for sample i:
                Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
@@ -133,6 +127,9 @@ computation.
     As a special case of 2), when each row of Input(Label) has only one
     non-zero element (equals 1), soft-label cross-entropy degenerates to a
     one-hot cross-entropy with one-hot label representation.
+Both inputs `X` and `Label` may or may not carry LoD (Level of Details)
+information, but the output shares its LoD only with input `X`.
 )DOC");
  }
 };

--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -102,7 +102,7 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel {
    int grid = (n + block - 1) / block;
    // TODO(qingqing) launch kernel on specified stream
    // base on ExecutionContext.
-    if (ctx.Attr<int>("soft_label") == 1) {
+    if (ctx.Attr<bool>("soft_label")) {
      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
      SoftCrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n,
                                                 d);
@@ -137,7 +137,7 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
    grid = (n + block - 1) / block;
    // TODO(qingqing): launch kernel on specified stream
    // base on ExecutionContext.
-    if (ctx.Attr<int>("soft_label") == 1) {
+    if (ctx.Attr<bool>("soft_label")) {
      auto* label_data = label->data<T>();
      SoftCrossEntropyGradientKernel<T><<<grid, block>>>(
          dx_data, dy_data, x_data, label_data, n, d);

--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -51,7 +51,7 @@ class CrossEntropyOpKernel : public framework::OpKernel {
    int batch_size = x->dims()[0];
    int class_num = x->dims()[1];
-    if (ctx.Attr<int>("soft_label") == 1) {
+    if (ctx.Attr<bool>("soft_label")) {
      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
      int index = 0;
      for (int i = 0; i < batch_size; ++i) {
@@ -92,7 +92,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel {
    int class_num = x->dims()[1];
    // TODO(qingqing): make zero setting an common function.
-    if (ctx.Attr<int>("soft_label") == 1) {
+    if (ctx.Attr<bool>("soft_label")) {
      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
      int index = 0;
      for (int i = 0; i < batch_size; ++i) {

--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -18,7 +18,6 @@ namespace paddle {
 namespace operators {
 using framework::Tensor;
-using framework::LoDTensor;
 class DropoutOp : public framework::OperatorWithKernel {
 public:
@@ -29,15 +28,13 @@ class DropoutOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
    PADDLE_ENFORCE_GE(ctx.Attr<float>("dropout_prob"), 0);
    PADDLE_ENFORCE_LE(ctx.Attr<float>("dropout_prob"), 1);
-    // TODO(xinghai-sun): remove this check after swtiching to bool
-    PADDLE_ENFORCE(ctx.Attr<int>("is_training") == 0 ||
-                   ctx.Attr<int>("is_training") == 1);
    auto dims = ctx.Input<Tensor>("X")->dims();
-    ctx.Output<LoDTensor>("Out")->Resize(dims);
+    ctx.Output<Tensor>("Out")->Resize(dims);
-    if (ctx.Attr<int>("is_training") == 1) {
+    if (ctx.Attr<bool>("is_training")) {
-      ctx.Output<LoDTensor>("Mask")->Resize(dims);
+      ctx.Output<Tensor>("Mask")->Resize(dims);
    }
+    ctx.ShareLoD("X", /*->*/ "Out");
  }
 };
@@ -49,8 +46,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddAttr<AttrType>("dropout_prob", "Probability of setting units to zero.")
        .SetDefault(.5f);
-    // TODO(xinghai-sun): use bool for is_training after bool is supported.
+    AddAttr<bool>("is_training", "Whether in training phase.").SetDefault(true);
-    AddAttr<int>("is_training", "Whether in training phase.").SetDefault(1);
    AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
    AddInput("X", "The input of dropout op.");
    AddOutput("Out", "The output of dropout op.");
@@ -59,7 +55,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
 Dropout Operator.
-"Dropout" refers to randomly dropping out units in a nerual network. It is a
+'Dropout' refers to randomly dropping out units in a neural network. It is a
 regularization technique for reducing overfitting by preventing neuron
 co-adaption during training. The dropout operator randomly set (according to
 the given dropout probability) the outputs of some units to zero, while others
@@ -75,8 +71,8 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx.Attr<int>("is_training"), 1,
+    PADDLE_ENFORCE(ctx.Attr<bool>("is_training"),
-                      "GradOp is only callable when is_training is true");
+                   "GradOp is only callable when is_training is true");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Mask"), "Mask must not be null.");
@@ -85,9 +81,6 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_GE(ctx.Attr<AttrType>("dropout_prob"), 0);
    PADDLE_ENFORCE_LE(ctx.Attr<AttrType>("dropout_prob"), 1);
-    // TODO(xinghai-sun): remove this check after swtiching to bool
-    PADDLE_ENFORCE(ctx.Attr<int>("is_training") == 0 ||
-                   ctx.Attr<int>("is_training") == 1);
    auto x_dims = ctx.Input<Tensor>("X")->dims();
    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
    PADDLE_ENFORCE_EQ(x_dims, out_dims,
@@ -96,7 +89,7 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(x_dims, mask_dims,
                      "Dimensions of Input(X) and Mask must be the same.");
-    auto *x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
    x_grad->Resize(x_dims);
  }
 };

--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -59,7 +59,7 @@ class GPUDropoutKernel : public framework::OpKernel {
    auto Y = EigenMatrix<T>::Reshape(*y, 1);
    auto place = context.GetEigenDevice<Place>();
-    if (context.Attr<int>("is_training") == 1) {
+    if (context.Attr<bool>("is_training")) {
      auto* mask = context.Output<Tensor>("Mask");
      auto* mask_data = mask->mutable_data<T>(context.GetPlace());
      int size = framework::product(mask->dims());

--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@@ -35,7 +35,7 @@ class CPUDropoutKernel : public framework::OpKernel {
    auto* y_data = y->mutable_data<T>(context.GetPlace());
    AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
-    if (context.Attr<int>("is_training") == 1) {
+    if (context.Attr<bool>("is_training")) {
      auto* mask = context.Output<Tensor>("Mask");
      auto* mask_data = mask->mutable_data<T>(context.GetPlace());
      int seed = context.Attr<int>("seed");
@@ -65,8 +65,8 @@ template <typename Place, typename T>
 class DropoutGradKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(context.Attr<int>("is_training"), 1,
+    PADDLE_ENFORCE(context.Attr<bool>("is_training"),
-                      "GradOp is only callable when is_training is true");
+                   "GradOp is only callable when is_training is true");
    auto* grad_x = context.Output<Tensor>(framework::GradVarName("X"));
    auto* grad_y = context.Input<Tensor>(framework::GradVarName("Out"));

--- a/paddle/operators/elementwise_add_op.cc
+++ b/paddle/operators/elementwise_add_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/operators/elementwise_add_op.h"
+namespace paddle {
+namespace operators {
+class ElementwiseAddOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwiseAddOpMaker(framework::OpProto* proto,
+                        framework::OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("add", "Out = X + Y");
+    AddComment(comment_);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker,
+            elementwise_add_grad, ops::ElementwiseOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_add,
+    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_add_grad,
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/sequence_avg_pool_op.cu
+++ b/paddle/operators/sequence_avg_pool_op.cu
@@ -13,13 +13,13 @@
   limitations under the License. */
 #define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_add_op.h"
-#include "paddle/operators/sequence_avg_pool_op.h"
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    sequence_avg_pool,
+    elementwise_add,
-    ops::SequenceAvgPoolKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseAddKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    sequence_avg_pool_grad,
+    elementwise_add_grad,
-    ops::SequenceAvgPoolGradKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "paddle/operators/elementwise_op.h"
+namespace paddle {
+namespace operators {
+template <typename Place, typename T>
+class ElementwiseAddKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseCompute<EigenAddFunctor, Place, T>(ctx);
+  }
+};
+template <typename T>
+struct ElementwiseAddGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = dz_e;
+    }
+  }
+};
+template <typename T>
+struct ElementwiseAddOneGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = dz_e.sum();
+    }
+  }
+};
+template <typename T>
+struct ElementwiseAddBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = dz_e.reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+template <typename T>
+struct ElementwiseAddBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = dz_e.reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+template <typename Place, typename T>
+class ElementwiseAddGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<Place, T, ElementwiseAddGradFunctor<T>,
+                           ElementwiseAddOneGradFunctor<T>,
+                           ElementwiseAddBroadCastGradFunctor<T>,
+                           ElementwiseAddBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
@@ -12,46 +12,17 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
-#include "paddle/operators/sigmoid_op.h"
+#include "paddle/operators/elementwise_div_op.h"
 namespace paddle {
 namespace operators {
+class ElementwiseDivOpMaker : public ElementwiseOpMaker {
-class SigmoidOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
- protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of SigmoidOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"),
-                            "Output(Y) of SigmoidOp should not be null.");
-    ctx.Output<framework::LoDTensor>("Y")->Resize(
-        ctx.Input<Tensor>("X")->dims());
-  }
-};
-class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SigmoidOpMaker(framework::OpProto *proto,
+  ElementwiseDivOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker *op_checker)
+                        framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+      : ElementwiseOpMaker(proto, op_checker) {
-    AddInput("X", "sigmoid input");
+    SetComment("Div", "Out = X / Y");
-    AddOutput("Y", "sigmoid output");
+    AddComment(comment_);
-    AddComment("Sigmoid function");
-  }
-};
-class SigmoidOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
- protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    ctx.Output<framework::LoDTensor>(framework::GradVarName("X"))
-        ->Resize(ctx.Input<Tensor>("Y")->dims());
  }
 };
@@ -59,9 +30,11 @@ class SigmoidOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker, sigmoid_grad,
+REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker,
-            ops::SigmoidOpGrad);
+            elementwise_div_grad, ops::ElementwiseOpGrad);
-REGISTER_OP_CPU_KERNEL(sigmoid,
+REGISTER_OP_CPU_KERNEL(
-                       ops::SigmoidKernel<paddle::platform::CPUPlace, float>);
+    elementwise_div,
+    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    sigmoid_grad, ops::SigmoidGradKernel<paddle::platform::CPUPlace, float>);
+    elementwise_div_grad,
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/elementwise_div_op.cu
+++ b/paddle/operators/elementwise_div_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_div_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    elementwise_div,
+    ops::ElementwiseDivKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    elementwise_div_grad,
+    ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "paddle/operators/elementwise_op.h"
+namespace paddle {
+namespace operators {
+template <typename Place, typename T>
+class ElementwiseDivKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseCompute<EigenDivFunctor, Place, T>(ctx);
+  }
+};
+template <typename T>
+struct ElementwiseDivGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto z_e = framework::EigenVector<T>::Flatten(*z);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e / y_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = -1.0 * dz_e * z_e / y_e;
+    }
+  }
+};
+template <typename T>
+struct ElementwiseDivBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
+                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e / y_e_bcast;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast))
+                           .reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+template <typename T>
+struct ElementwiseDivBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
+                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e / y_e_bcast;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast))
+                           .reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+template <typename Place, typename T>
+class ElementwiseDivGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<Place, T, ElementwiseDivGradFunctor<T>,
+                           ElementwiseDivGradFunctor<T>,
+                           ElementwiseDivBroadCastGradFunctor<T>,
+                           ElementwiseDivBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
@@ -17,101 +17,25 @@
 namespace paddle {
 namespace operators {
-using Tensor = framework::Tensor;
+class ElementwiseMulOpMaker : public ElementwiseOpMaker {
-class ElementWiseMulOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
- protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of ElementWiseMulOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
-                            "Input(Y) of ElementWiseMulOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.OutputVar("Out"),
-        "Output(Out) of ElementWiseMulOp should not be null.");
-    auto x_dim = ctx.Input<Tensor>("X")->dims();
-    auto y_dim = ctx.Input<Tensor>("Y")->dims();
-    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
-                      "Rank of first input must >= rank of second input.")
-    ctx.Output<framework::LoDTensor>("Out")->Resize(x_dim);
-  }
-};
-class ElementWiseMulOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ElementWiseMulOpMaker(framework::OpProto *proto,
+  ElementwiseMulOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker *op_checker)
+                        framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+      : ElementwiseOpMaker(proto, op_checker) {
-    AddInput("X", "The first input of elementwise mul op");
+    SetComment("Mul", "Out = X ⊙ Y");
-    AddInput("Y", "The second input of elementwise mul op");
+    AddComment(comment_);
-    AddAttr<int>("axis",
-                 R"DOC(
-When shape(Y) does not equal shape(X),Y will be broadcasted 
-to match the shape of X and axis should be dimension index Y in X
-        )DOC")
-        .SetDefault(-1)
-        .EqualGreaterThan(-1);
-    AddOutput("Out", "The output of elementwise mul op");
-    AddComment(R"DOC(
-Limited elementwise multiple operator.The equation is: Out = X ⊙ Y.
-1. The shape of Y should be same with X or
-2. Y's shape is a subset of X. 
-   Y will be broadcasted to match the shape of X and axis should be dimension index Y in X.
-   example:
-      shape(X) = (2, 3, 4, 5), shape(Y) = (,)
-      shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
-      shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
-      shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
-      shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
-)DOC");
  }
 };
-class ElementWiseMulOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
- protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto y_dims = ctx.Input<Tensor>("Y")->dims();
-    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
-    auto *x_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto *y_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
-    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-                      "Rank of first input must >= rank of second input.")
-    if (x_grad) {
-      x_grad->Resize(x_dims);
-    }
-    if (y_grad) {
-      y_grad->Resize(y_dims);
-    }
-  }
-};
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_mul, ops::ElementWiseMulOp, ops::ElementWiseMulOpMaker,
+REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
-            elementwise_mul_grad, ops::ElementWiseMulOpGrad);
+            elementwise_mul_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
    elementwise_mul,
-    ops::ElementWiseMulKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
    elementwise_mul_grad,
-    ops::ElementWiseMulGradKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/elementwise_mul_op.cu
+++ b/paddle/operators/elementwise_mul_op.cu
@@ -19,7 +19,7 @@ namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
    elementwise_mul,
-    ops::ElementWiseMulKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
    elementwise_mul_grad,
-    ops::ElementWiseMulGradKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
@@ -13,171 +13,104 @@
   limitations under the License. */
 #pragma once
-#include "paddle/framework/eigen.h"
+#include "paddle/operators/elementwise_op.h"
-#include "paddle/framework/op_registry.h"
 namespace paddle {
 namespace operators {
-/*
- * Out = X ⊙ Y
- * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
- *    pre=2, n=3*4, post=5
- * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5)
- *    pre=2*3, n=4*5, post=1
- */
-inline void get_mid_dims(const framework::DDim& x_dims,
-                         const framework::DDim& y_dims, const int axis,
-                         int& pre, int& n, int& post) {
-  pre = 1;
-  n = 1;
-  post = 1;
-  for (int i = 0; i < axis; ++i) {
-    pre *= x_dims[i];
-  }
-  for (int i = 0; i < y_dims.size(); ++i) {
-    PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i],
-                      "Broadcast dimension mismatch.");
-    n *= y_dims[i];
-  }
-  for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
-    post *= x_dims[i];
-  }
-}
 template <typename Place, typename T>
-class ElementWiseMulKernel : public framework::OpKernel {
+class ElementwiseMulKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
+    ElementwiseCompute<EigenMulFunctor, Place, T>(ctx);
+  }
-    auto* x = ctx.Input<Tensor>("X");
+};
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* z = ctx.Output<Tensor>("Out");
-    z->mutable_data<T>(ctx.GetPlace());
+template <typename T>
+struct ElementwiseMulGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
    auto x_e = framework::EigenVector<T>::Flatten(*x);
    auto y_e = framework::EigenVector<T>::Flatten(*y);
-    auto z_e = framework::EigenVector<T>::Flatten(*z);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
-    auto x_dims = x->dims();
+    if (dx) {
-    auto y_dims = y->dims();
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
-    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+      dx_e.device(d) = dz_e * y_e;
-                      "Rank of first input must >= rank of second input.")
-    if (x_dims == y_dims || product(y_dims) == 1) {
-      z_e.device(ctx.GetEigenDevice<Place>()) = x_e * y_e;
-      return;
    }
-    int axis = ctx.Attr<int>("axis");
+    if (dy) {
-    axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
-    PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+      dy_e.device(d) = x_e * dz_e;
-                   "Axis should be in range [0, x_dims)");
-    int pre, n, post;
-    get_mid_dims(x_dims, y_dims, axis, pre, n, post);
-    if (post == 1) {
-      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
-                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
-                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-      z_e.device(ctx.GetEigenDevice<Place>()) = x_e * y_bcast;
-      return;
-    } else {
-      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
-                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
-                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-      z_e.device(ctx.GetEigenDevice<Place>()) = x_e * y_bcast;
-      return;
    }
  }
 };
-template <typename Place, typename T>
+template <typename T>
-class ElementWiseMulGradKernel : public framework::OpKernel {
+struct ElementwiseMulBroadCastGradFunctor {
- public:
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
-  void Compute(const framework::ExecutionContext& ctx) const override {
+            typename dY, typename dZ, typename Pre, typename N>
-    using Tensor = framework::Tensor;
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto x_e = framework::EigenVector<T>::Flatten(*x);
    auto y_e = framework::EigenVector<T>::Flatten(*y);
-    auto dout_e = framework::EigenVector<T>::Flatten(*dout);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
-    auto x_dims = x->dims();
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
-    auto y_dims = y->dims();
+                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
    if (dx) {
-      dx->mutable_data<T>(ctx.GetPlace());
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e * y_e_bcast;
    }
    if (dy) {
-      dy->mutable_data<T>(ctx.GetPlace());
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (x_e * dz_e)
+                           .reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
    }
+  }
+};
-    if (x_dims == y_dims || product(y_dims) == 1) {
+template <typename T>
-      if (dx) {
+struct ElementwiseMulBroadCast2GradFunctor {
-        auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
-        dx_e.device(ctx.GetEigenDevice<Place>()) = dout_e * y_e;
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
-      }
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
-      if (dy) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
-        auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
-        dy_e.device(ctx.GetEigenDevice<Place>()) = x_e * dout_e;
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
-      }
-      return;
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
+                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e * y_e_bcast;
    }
-    int axis = ctx.Attr<int>("axis");
+    if (dy) {
-    axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (x_e * dz_e)
-    int pre, n, post;
+                           .reshape(Eigen::DSizes<int, 3>(pre, n, post))
-    get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+                           .sum(Eigen::array<int, 2>{{0, 2}});
-    // TODO(gongweibao): wrap reshape to a function.
-    if (post == 1) {
-      auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
-                           .broadcast(Eigen::DSizes<int, 2>(pre, 1))
-                           .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-      if (dx) {
-        auto dx_e = framework::EigenVector<T>::Flatten(*dx);
-        dx_e.device(ctx.GetEigenDevice<Place>()) = dout_e * y_e_bcast;
-      }
-      if (dy) {
-        auto dy_e = framework::EigenVector<T>::Flatten(*dy);
-        dy_e.device(ctx.GetEigenDevice<Place>()) =
-            (x_e * dout_e)
-                .reshape(Eigen::DSizes<int, 2>(pre, n))
-                .sum(Eigen::array<int, 1>{{0}});
-      }
-      return;
-    } else {
-      auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
-                           .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
-                           .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-      if (dx) {
-        auto dx_e = framework::EigenVector<T>::Flatten(*dx);
-        dx_e.device(ctx.GetEigenDevice<Place>()) = dout_e * y_e_bcast;
-      }
-      if (dy) {
-        auto dy_e = framework::EigenVector<T>::Flatten(*dy);
-        dy_e.device(ctx.GetEigenDevice<Place>()) =
-            (x_e * dout_e)
-                .reshape(Eigen::DSizes<int, 3>(pre, n, post))
-                .sum(Eigen::array<int, 2>{{0, 2}});
-      }
-      return;
    }
  }
 };
+template <typename Place, typename T>
+class ElementwiseMulGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<Place, T, ElementwiseMulGradFunctor<T>,
+                           ElementwiseMulGradFunctor<T>,
+                           ElementwiseMulBroadCastGradFunctor<T>,
+                           ElementwiseMulBroadCast2GradFunctor<T>>(ctx);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/elementwise_op.h
+++ b/paddle/operators/elementwise_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <iostream>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+namespace paddle {
+namespace operators {
+/*
+ * Out = X ⊙ Y
+ * If Y's shape does not match X's shape, both are reshaped and Y is broadcast.
+ * For example:
+ * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
+ *    pre=2, n=3*4, post=5
+ *    x.shape(2, 12, 5) * y.shape(1,12,1).broadcast(2,12,5)
+ * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5)
+ *    pre=2*3, n=4*5, post=1
+ *    x.shape(6, 20) * y.shape(1, 20).broadcast(6, 20)
+ */
+// Computes the (pre, n, post) factorization used to broadcast Y over X:
+//   pre  = product of x_dims[0, axis)
+//   n    = product of y_dims (each entry must equal the aligned x_dims entry)
+//   post = product of x_dims[axis + rank(Y), rank(X))
+// Raises an enforce error when Y, placed at `axis`, does not fit inside X or
+// when an aligned dimension differs.
+inline void get_mid_dims(const framework::DDim& x_dims,
+                         const framework::DDim& y_dims, const int axis,
+                         int& pre, int& n, int& post) {
+  // Validate before indexing: x_dims[i + axis] below would otherwise be read
+  // outside of x_dims for a negative or too-large axis.
+  PADDLE_ENFORCE(axis >= 0 && axis + y_dims.size() <= x_dims.size(),
+                 "Axis should place the dimensions of Y inside those of X.");
+  pre = 1;
+  n = 1;
+  post = 1;
+  for (int i = 0; i < axis; ++i) {
+    pre *= x_dims[i];
+  }
+  for (int i = 0; i < y_dims.size(); ++i) {
+    PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i],
+                      "Broadcast dimension mismatch.");
+    n *= y_dims[i];
+  }
+  for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
+    post *= x_dims[i];
+  }
+}
+/*
+ * EIGEN_FUNCTOR(name, eigen_op) defines `struct Eigen<name>Functor` with three
+ * member templates, each parameterized on <Place, T>. All of them flatten x, y
+ * and z into 1-D Eigen vectors and write eigen_op(x, y') into z on the Eigen
+ * device obtained from ctx:
+ *   Run           - y' is y itself.
+ *   RunBroadCast  - y' is y reshaped to (1, n), broadcast by (pre, 1) and
+ *                   flattened back to x's element count.
+ *   RunBroadCast2 - y' is y reshaped to (1, n, 1), broadcast by (pre, 1, post)
+ *                   and flattened back to x's element count.
+ * `eigen_op` must be callable as eigen_op(lhs_expr, rhs_expr).
+ */
+#define EIGEN_FUNCTOR(name, eigen_op)                                          \
+  struct Eigen##name##Functor {                                                \
+    template <typename Place, typename T>                                      \
+    inline void Run(const framework::Tensor* x, const framework::Tensor* y,    \
+                    framework::Tensor* z,                                      \
+                    const framework::ExecutionContext& ctx) {                  \
+      auto x_e = framework::EigenVector<T>::Flatten(*x);                       \
+      auto y_e = framework::EigenVector<T>::Flatten(*y);                       \
+      auto z_e = framework::EigenVector<T>::Flatten(*z);                       \
+      z_e.device(ctx.GetEigenDevice<Place>()) = eigen_op(x_e, y_e);            \
+    }                                                                          \
+    template <typename Place, typename T>                                      \
+    inline void RunBroadCast(const framework::Tensor* x,                       \
+                             const framework::Tensor* y, framework::Tensor* z, \
+                             const framework::ExecutionContext& ctx, int pre,  \
+                             int n) {                                          \
+      auto x_e = framework::EigenVector<T>::Flatten(*x);                       \
+      auto y_e = framework::EigenVector<T>::Flatten(*y);                       \
+      auto z_e = framework::EigenVector<T>::Flatten(*z);                       \
+      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))                  \
+                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))             \
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));          \
+      z_e.device(ctx.GetEigenDevice<Place>()) = eigen_op(x_e, y_bcast);        \
+    }                                                                          \
+    template <typename Place, typename T>                                      \
+    inline void RunBroadCast2(const framework::Tensor* x,                      \
+                              const framework::Tensor* y,                      \
+                              framework::Tensor* z,                            \
+                              const framework::ExecutionContext& ctx, int pre, \
+                              int n, int post) {                               \
+      auto x_e = framework::EigenVector<T>::Flatten(*x);                       \
+      auto y_e = framework::EigenVector<T>::Flatten(*y);                       \
+      auto z_e = framework::EigenVector<T>::Flatten(*z);                       \
+      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))               \
+                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))       \
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));          \
+      z_e.device(ctx.GetEigenDevice<Place>()) = eigen_op(x_e, y_bcast);        \
+    }                                                                          \
+  }
+// Forward driver shared by the elementwise kernels.
+//
+// Reads inputs "X" and "Y", allocates output "Out" and dispatches to one of
+// the member templates of `functor` (Run / RunBroadCast / RunBroadCast2):
+//   - identical shapes         -> Run
+//   - Y holds a single element -> RunBroadCast with pre = numel(X), n = 1
+//   - otherwise                -> Y is aligned at attribute "axis" (-1 means
+//                                 trailing alignment) and broadcast over X.
+template <class functor, typename Place, typename T>
+void ElementwiseCompute(const framework::ExecutionContext& ctx) {
+  using Tensor = framework::Tensor;
+  auto* x = ctx.Input<Tensor>("X");
+  auto* y = ctx.Input<Tensor>("Y");
+  auto* z = ctx.Output<Tensor>("Out");
+  z->mutable_data<T>(ctx.GetPlace());
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+  PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+                    "Rank of first input must >= rank of second input.")
+  functor f;
+  if (x_dims == y_dims) {
+    f.template Run<Place, T>(x, y, z, ctx);
+    return;
+  }
+  if (product(y_dims) == 1) {
+    // Run() combines the flattened X and Y element by element, which is only
+    // valid when both have the same length. A single-element Y therefore has
+    // to be broadcast explicitly to X's element count.
+    f.template RunBroadCast<Place, T>(x, y, z, ctx,
+                                      static_cast<int>(product(x_dims)), 1);
+    return;
+  }
+  int axis = ctx.Attr<int>("axis");
+  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+  PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+                 "Axis should be in range [0, x_dims)");
+  int pre, n, post;
+  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+  if (post == 1) {
+    // Y covers the trailing dimensions of X: a 2-D (pre, n) broadcast suffices.
+    f.template RunBroadCast<Place, T>(x, y, z, ctx, pre, n);
+    return;
+  }
+  f.template RunBroadCast2<Place, T>(x, y, z, ctx, pre, n, post);
+}
+// Instantiate the forward functors EigenAddFunctor, EigenSubFunctor,
+// EigenMulFunctor and EigenDivFunctor. Each EIGEN_<OP> helper macro is the
+// binary expression that EIGEN_FUNCTOR applies to the two operand expressions.
+#define EIGEN_ADD(x, y) ((x) + (y))
+EIGEN_FUNCTOR(Add, EIGEN_ADD);
+#define EIGEN_SUB(x, y) ((x) - (y))
+EIGEN_FUNCTOR(Sub, EIGEN_SUB);
+#define EIGEN_MUL(x, y) ((x) * (y))
+EIGEN_FUNCTOR(Mul, EIGEN_MUL);
+#define EIGEN_DIV(x, y) ((x) / (y))
+EIGEN_FUNCTOR(Div, EIGEN_DIV);
+// Backward driver shared by the elementwise kernels.
+//
+// Reads "X", "Y", "Out" and Out@GRAD, allocates X@GRAD / Y@GRAD when they are
+// requested (non-null) and dispatches to one of four functors:
+//   functor           - X and Y have identical shapes
+//   functor1          - Y holds a single element
+//   broadcastfunctor  - Y broadcast over X with post == 1 (called with pre, n)
+//   broadcast2functor - Y broadcast over X otherwise (called with pre, n, post)
+// Every functor is invoked as f(device, x, y, out, dx, dy, dout, ...); dx and
+// dy may be null and must be checked by the functor.
+template <typename Place, typename T, typename functor, typename functor1,
+          typename broadcastfunctor, typename broadcast2functor>
+void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
+  using Tensor = framework::Tensor;
+  auto* x = ctx.Input<Tensor>("X");
+  auto* y = ctx.Input<Tensor>("Y");
+  auto* out = ctx.Input<Tensor>("Out");
+  auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+  auto place = ctx.GetEigenDevice<Place>();
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+  auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+  auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+  if (dx) {
+    dx->mutable_data<T>(ctx.GetPlace());
+  }
+  if (dy) {
+    dy->mutable_data<T>(ctx.GetPlace());
+  }
+  if (x_dims == y_dims) {
+    functor f;
+    f(place, x, y, out, dx, dy, dout);
+    return;
+  }
+  if (product(y_dims) == 1) {
+    functor1 f;
+    f(place, x, y, out, dx, dy, dout);
+    return;
+  }
+  int axis = ctx.Attr<int>("axis");
+  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+  // Same range check as the forward pass, so that an invalid axis is reported
+  // instead of being used to index x_dims.
+  PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+                 "Axis should be in range [0, x_dims)");
+  int pre, n, post;
+  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+  if (post == 1) {
+    broadcastfunctor f;
+    f(place, x, y, out, dx, dy, dout, pre, n);
+    return;
+  }
+  broadcast2functor f;
+  f(place, x, y, out, dx, dy, dout, pre, n, post);
+}
+// Forward operator definition shared by the elementwise ops. It only performs
+// shape inference; the arithmetic lives in the registered kernels.
+class ElementwiseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  using Tensor = framework::Tensor;
+  // Requires X, Y and Out to be present and rank(X) >= rank(Y). Out is resized
+  // to X's dims and shares X's LoD.
+  void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of elementwise op should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
+                            "Input(Y) of elementwise op should not be null");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Out"),
+        "Output(Out) of elementwise op should not be null.");
+    // The argument spelling of the enforce below is kept as-is in case the
+    // macro embeds it in its error message.
+    const auto x_dim = ctx.Input<Tensor>("X")->dims();
+    const auto y_dim = ctx.Input<Tensor>("Y")->dims();
+    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
+                      "Rank of first input must >= rank of second input.")
+    auto* result = ctx.Output<Tensor>("Out");
+    result->Resize(x_dim);
+    ctx.ShareLoD("X", /*->*/ "Out");
+  }
+};
+// Proto maker shared by the elementwise ops. It declares inputs X and Y, the
+// `axis` attribute and output Out, and stores a comment template containing
+// the placeholders {name} and {equation}. Derived makers fill the template in
+// through SetComment() and then add comment_ again.
+class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ElementwiseOpMaker(framework::OpProto* proto,
+                     framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", R"DOC(
+The first input of elementwise op, it's a tensor of any dimensions.
+)DOC");
+    AddInput("Y", R"DOC(
+The sencond input of elementwise op, it's a tensor and it's dimensions
+must be small or equal to X's dimensions.
+)DOC");
+    AddAttr<int>("axis",
+                 R"DOC(
+When the shape(Y) does not equal the shape(X),Y will be broadcasted 
+to match the shape of X and axis should be dimension index Y in X
+        )DOC")
+        .SetDefault(-1)
+        .EqualGreaterThan(-1);
+    AddOutput("Out", "The output of elementwise op");
+    comment_ = R"DOC(
+Limited elementwise {name} operator.The equation is: Out = {equation}.
+1. The shape of Y should be same with X or
+2. Y's shape is a subset of X. 
+   Y will be broadcasted to match the shape of X and axis should be dimension index Y in X.
+   example:
+      shape(X) = (2, 3, 4, 5), shape(Y) = (,)
+      shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
+      shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
+      shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
+      shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
+Both the input X and Y can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD with input X.
+)DOC";
+    AddComment(comment_);
+  }
+ protected:
+  // Comment template; {name} and {equation} are substituted by SetComment().
+  std::string comment_;
+  // Replaces every occurrence of `from` in `src` with `to`, in place. Text
+  // inserted by a replacement is not scanned again.
+  void Replace(std::string& src, std::string from, std::string to) {
+    // An empty pattern matches at every position and would never terminate.
+    if (from.empty()) {
+      return;
+    }
+    // Use the string sizes directly; strlen() would stop at an embedded NUL
+    // and needs <cstring>, which this header does not include.
+    for (std::size_t pos = src.find(from); pos != std::string::npos;
+         pos = src.find(from, pos + to.size())) {
+      src.replace(pos, from.size(), to);
+    }
+  }
+  // Substitutes the op name and its equation into the comment template.
+  void SetComment(std::string name, std::string equation) {
+    Replace(comment_, "{name}", name);
+    Replace(comment_, "{equation}", equation);
+  }
+};
+// Gradient operator definition shared by the elementwise ops. It only performs
+// shape inference for X@GRAD and Y@GRAD.
+class ElementwiseOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  using Tensor = framework::Tensor;
+ protected:
+  // Requires X, Y and Out@GRAD to be present and rank(X) >= rank(Y). Each
+  // requested gradient output is resized to the dims of its forward input;
+  // either output may be absent.
+  void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
+                            "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
+    auto y_dims = ctx.Input<Tensor>("Y")->dims();
+    auto* x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* y_grad = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
+    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+                      "Rank of first input must >= rank of second input.")
+    if (x_grad) {
+      x_grad->Resize(x_dims);
+    }
+    if (y_grad) {
+      y_grad->Resize(y_dims);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/elementwise_sub_op.cc
+++ b/paddle/operators/elementwise_sub_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/operators/elementwise_sub_op.h"
+namespace paddle {
+namespace operators {
+// Proto maker for elementwise_sub. The base constructor declares the inputs,
+// attribute and output; this constructor fills the {name} / {equation}
+// placeholders of the inherited comment template.
+class ElementwiseSubOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwiseSubOpMaker(framework::OpProto* proto,
+                        framework::OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Sub", "Out = X - Y");
+    // The base constructor already called AddComment() with the unsubstituted
+    // template; this second call passes the substituted text.
+    // NOTE(review): presumably the later call replaces the earlier comment -
+    // confirm against OpProtoAndCheckerMaker::AddComment.
+    AddComment(comment_);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the elementwise_sub operator and its gradient operator, both using
+// the shared ElementwiseOp / ElementwiseOpGrad shape-inference classes.
+REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker,
+            elementwise_sub_grad, ops::ElementwiseOpGrad);
+// CPU kernels, float only.
+REGISTER_OP_CPU_KERNEL(
+    elementwise_sub,
+    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_sub_grad,
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/elementwise_sub_op.cu
+++ b/paddle/operators/elementwise_sub_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_sub_op.h"
+namespace ops = paddle::operators;
+// GPU kernels for elementwise_sub and its gradient, float only.
+REGISTER_OP_GPU_KERNEL(
+    elementwise_sub,
+    ops::ElementwiseSubKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    elementwise_sub_grad,
+    ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "paddle/operators/elementwise_op.h"
+namespace paddle {
+namespace operators {
+// Forward kernel for elementwise_sub: delegates to the shared forward driver
+// with EigenSubFunctor.
+template <typename Place, typename T>
+class ElementwiseSubKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseCompute<EigenSubFunctor, Place, T>(ctx);
+  }
+};
+// Gradient of Out = X - Y for the case where X and Y have identical shapes:
+// dX = dOut and dY = -dOut. x, y and z are accepted only to match the common
+// functor signature and are not read.
+template <typename T>
+struct ElementwiseSubGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto out_grad = framework::EigenVector<T>::Flatten(*dz);
+    // Either gradient may be null when the caller did not request it.
+    if (dx) {
+      auto x_grad = framework::EigenVector<T>::Flatten(*dx);
+      x_grad.device(d) = out_grad;
+    }
+    if (dy) {
+      auto y_grad = framework::EigenVector<T>::Flatten(*dy);
+      y_grad.device(d) = (-1.0) * out_grad;
+    }
+  }
+};
+// Gradient of Out = X - Y for the case where Y holds a single element:
+// dX = dOut and dY = -(sum of all elements of dOut). x, y and z are accepted
+// only to match the common functor signature and are not read.
+template <typename T>
+struct ElementwiseSubOneGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto out_grad = framework::EigenVector<T>::Flatten(*dz);
+    // Either gradient may be null when the caller did not request it.
+    if (dx) {
+      auto x_grad = framework::EigenVector<T>::Flatten(*dx);
+      x_grad.device(d) = out_grad;
+    }
+    if (dy) {
+      auto y_grad = framework::EigenVector<T>::Flatten(*dy);
+      // Full reduction of dOut, negated.
+      y_grad.device(d) = (-1.0) * out_grad.sum();
+    }
+  }
+};
+// Gradient of Out = X - Y for the 2-D broadcast case (post == 1):
+// dX = dOut and dY = -(dOut viewed as (pre, n), summed over dimension 0).
+// x, y and z are accepted only to match the common functor signature.
+template <typename T>
+struct ElementwiseSubBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto out_grad = framework::EigenVector<T>::Flatten(*dz);
+    // Either gradient may be null when the caller did not request it.
+    if (dx) {
+      auto x_grad = framework::EigenVector<T>::Flatten(*dx);
+      x_grad.device(d) = out_grad;
+    }
+    if (dy) {
+      auto y_grad = framework::EigenVector<T>::Flatten(*dy);
+      // Collapse the broadcast (leading) dimension back onto Y's n elements.
+      y_grad.device(d) = (-1.0) *
+                         out_grad.reshape(Eigen::DSizes<int, 2>(pre, n))
+                             .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+// Gradient of Out = X - Y for the 3-D broadcast case:
+// dX = dOut and dY = -(dOut viewed as (pre, n, post), summed over dimensions
+// 0 and 2). x, y and z are accepted only to match the common functor
+// signature.
+template <typename T>
+struct ElementwiseSubBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto out_grad = framework::EigenVector<T>::Flatten(*dz);
+    // Either gradient may be null when the caller did not request it.
+    if (dx) {
+      auto x_grad = framework::EigenVector<T>::Flatten(*dx);
+      x_grad.device(d) = out_grad;
+    }
+    if (dy) {
+      auto y_grad = framework::EigenVector<T>::Flatten(*dy);
+      // Collapse both broadcast dimensions back onto Y's n elements.
+      y_grad.device(d) = (-1.0) *
+                         out_grad.reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                             .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+// Backward kernel for elementwise_sub: delegates to the shared gradient driver
+// and supplies, in order, the functors for the same-shape case, the
+// single-element-Y case, the 2-D broadcast case and the 3-D broadcast case.
+template <typename Place, typename T>
+class ElementwiseSubGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<Place, T, ElementwiseSubGradFunctor<T>,
+                           ElementwiseSubOneGradFunctor<T>,
+                           ElementwiseSubBroadCastGradFunctor<T>,
+                           ElementwiseSubBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/fc_op.cc
+++ b/paddle/operators/fc_op.cc
@@ -186,6 +186,9 @@ W_i is a 2-D matrix of size (K x N), where N means the number of neurons
 in the fully connected layer. B is a 1-D vector of size N.
 Thus, the output Out is a 2-D matrix of size (M x N).
 Activation type can be set to `identity` (default), `sigmoid` or `softmax`.
+All the inputs can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD with first input (`X[0]`).
 )DOC");
  }
 };

--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -23,15 +23,14 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-        ctx.InputVar("Src"),
+                            "Input(X) of FillZerosLikeOp should not be null.");
-        "Input(Src) of FillZerosLikeOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"),
-    PADDLE_ENFORCE_NOT_NULL(
+                            "Output(Y) of FillZerosLikeOp should not be null.");
-        ctx.OutputVar("Dst"),
-        "Output(Dst) of FillZerosLikeOp should not be null.");
+    ctx.Output<framework::Tensor>("Y")->Resize(
+        ctx.Input<framework::Tensor>("X")->dims());
-    ctx.Output<framework::LoDTensor>("Dst")->Resize(
+    ctx.ShareLoD("X", /*->*/ "Y");
-        ctx.Input<framework::Tensor>("Src")->dims());
  }
 };
@@ -40,8 +39,8 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
  FillZerosLikeOpMaker(framework::OpProto *proto,
                       framework::OpAttrChecker *op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Src", "The input of fill-zeros-like op.");
+    AddInput("X", "The input of fill-zeros-like op.");
-    AddOutput("Dst", "The varibale will be filled up with zeros.");
+    AddOutput("Y", "The varibale will be filled up with zeros.");
    AddComment(R"DOC(
 Fill up a vriable with zeros.

--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -23,7 +23,7 @@ template <typename Place, typename T>
 class FillZerosLikeKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto* output = context.Output<framework::Tensor>("Dst");
+    auto* output = context.Output<framework::Tensor>("Y");
    output->mutable_data<T>(context.GetPlace());
    auto t = framework::EigenVector<T>::Flatten(*output);
    t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));

--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -35,7 +35,7 @@ class GatherOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
    framework::DDim output_dims(ctx.Input<Tensor>("X")->dims());
    output_dims[0] = batch_size;
-    ctx.Output<framework::LoDTensor>("Out")->Resize(output_dims);
+    ctx.Output<framework::Tensor>("Out")->Resize(output_dims);
  }
 };
@@ -45,7 +45,7 @@ class GatherGradOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto X_grad = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto X_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
    auto X = ctx.Input<Tensor>("X");
    X_grad->Resize(X->dims());

--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -48,7 +48,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
        ctx.OutputVar("Out"),
        "Output(Out) of GaussianRandomOp should not be null.");
-    auto* tensor = ctx.Output<framework::LoDTensor>("Out");
+    auto* tensor = ctx.Output<framework::Tensor>("Out");
    auto dims = Attr<std::vector<int>>("dims");
    std::vector<int64_t> temp;
    temp.reserve(dims.size());

--- a/paddle/operators/gemm_conv2d_op.h
+++ b/paddle/operators/gemm_conv2d_op.h
@@ -75,9 +75,6 @@ class GemmConv2DKernel : public framework::OpKernel {
    framework::DDim output_matrix_shape = {output_channels,
                                           output_height * output_width};
-    auto* device_context =
-        const_cast<platform::DeviceContext*>(context.device_context_);
    // convolution operator: im2col + gemm
    int in_step = input_channels / groups;
    int out_step = output_channels / groups;
@@ -87,14 +84,14 @@ class GemmConv2DKernel : public framework::OpKernel {
      for (int g = 0; g < groups; g++) {
        // im2col
        Tensor in_slice = in_batch.Slice<T>(g * in_step, (g + 1) * in_step);
-        im2col(in_slice, col, strides[0], strides[1], paddings[0], paddings[1],
+        im2col(context.device_context(), in_slice, col, strides[0], strides[1],
-               device_context);
+               paddings[0], paddings[1]);
        // gemm
        Tensor out_slice = out_batch.Slice<T>(g * out_step, (g + 1) * out_step);
        Tensor filter_slice = filter.Slice<T>(g * out_step, (g + 1) * out_step);
-        math::matmul<Place, T>(filter_slice, false, col_matrix, false, T(1.0),
+        math::matmul<Place, T>(context.device_context(), filter_slice, false,
-                               &out_slice, T(0.0), device_context);
+                               col_matrix, false, T(1.0), &out_slice, T(0.0));
      }
    }
  }
@@ -160,9 +157,6 @@ class GemmConvGrad2DKernel : public framework::OpKernel {
                                           filter.numel() / filter.dims()[0]};
    filter.Resize(filter_matrix_shape);
-    auto* device_context =
-        const_cast<platform::DeviceContext*>(context.device_context_);
    // convolution backward input operator:  gemm + col2im
    // convolution backward weight operator: im2col + gemm
    int in_step = input_channels / groups;
@@ -184,14 +178,15 @@ class GemmConvGrad2DKernel : public framework::OpKernel {
              out_grad_batch.Slice<T>(g * out_step, (g + 1) * out_step);
          Tensor filter_slice =
              filter.Slice<T>(g * out_step, (g + 1) * out_step);
-          math::matmul<Place, T>(filter_slice, true, out_grad_slice, false,
+          math::matmul<Place, T>(context.device_context(), filter_slice, true,
-                                 T(1.0), &col_matrix, T(0.0), device_context);
+                                 out_grad_slice, false, T(1.0), &col_matrix,
+                                 T(0.0));
          // col2im
          Tensor in_grad_slice =
              in_grad_batch.Slice<T>(g * in_step, (g + 1) * in_step);
-          col2im(in_grad_slice, col, strides[0], strides[1], paddings[0],
+          col2im(context.device_context(), in_grad_slice, col, strides[0],
-                 paddings[1], device_context);
+                 strides[1], paddings[0], paddings[1]);
        }
      }
    }
@@ -212,15 +207,15 @@ class GemmConvGrad2DKernel : public framework::OpKernel {
          Tensor out_grad_slice =
              out_grad_batch.Slice<T>(g * out_step, (g + 1) * out_step);
          Tensor in_slice = in_batch.Slice<T>(g * in_step, (g + 1) * in_step);
-          im2col(in_slice, col, strides[0], strides[1], paddings[0],
+          im2col(context.device_context(), in_slice, col, strides[0],
-                 paddings[1], device_context);
+                 strides[1], paddings[0], paddings[1]);
          // gemm
          Tensor filter_grad_slice =
              filter_grad_.Slice<T>(g * out_step, (g + 1) * out_step);
-          math::matmul<Place, T>(out_grad_slice, false, col_matrix, true,
+          math::matmul<Place, T>(context.device_context(), out_grad_slice,
-                                 T(1.0), &filter_grad_slice, T(1.0),
+                                 false, col_matrix, true, T(1.0),
-                                 device_context);
+                                 &filter_grad_slice, T(1.0));
        }
      }
    }

--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -32,9 +32,10 @@ class LookupTableOp : public framework::OperatorWithKernel {
    auto table_t = ctx.Input<Tensor>("W");
    auto ids_t = ctx.Input<Tensor>("Ids");
-    auto output_t = ctx.Output<framework::LoDTensor>("Out");
+    auto output_t = ctx.Output<framework::Tensor>("Out");
    output_t->Resize({ids_t->dims()[0], table_t->dims()[1]});
+    ctx.ShareLoD("Ids", /*->*/ "Out");
  }
 };
@@ -50,9 +51,13 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
             "An input with type int32 or int64"
             "contains the ids to be looked up in W.");
    AddOutput("Out", "The lookup results, which have the same type with W.");
-    AddComment(
+    AddComment(R"DOC(
-        "This operator is used to perform lookups on the parameter W,"
+This operator is used to perform lookups on the parameter W,
-        "then concatenated into a dense tensor.");
+then concatenated into a dense tensor.
+The input `Ids` can carry the LoD (Level of Details) information,
+or not. And the output only shares the LoD with input `Ids`.
+)DOC");
  }
 };
@@ -64,7 +69,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
  void InferShape(const framework::InferShapeContext &context) const override {
    auto table = context.Input<Tensor>("W");
    auto d_table =
-        context.Output<framework::LoDTensor>(framework::GradVarName("W"));
+        context.Output<framework::Tensor>(framework::GradVarName("W"));
    d_table->Resize(table->dims());
  }
 };

--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/operators/lstm_unit_op.h"
+namespace paddle {
+namespace operators {
+// Forward operator definition of the LSTM unit. It only performs shape
+// inference; the numerical work is done by the registered kernels.
+class LstmUnitOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  // Validates the bound variables and the input shapes, then resizes both
+  // outputs (C and H) to {batch size, state dim} taken from C_prev.
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of LSTM should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("C_prev"),
+                            "Input(C_prev) of LSTM should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("C"),
+                            "Output(C) of LSTM should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("H"),
+                            "Output(H) of LSTM should not be null.");
+    auto *x = ctx.Input<framework::Tensor>("X");
+    auto *c_prev = ctx.Input<framework::Tensor>("C_prev");
+    // NOTE(review): only X's rank is checked; C_prev's dims()[1] is read
+    // below without an explicit rank check on C_prev.
+    PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2.");
+    PADDLE_ENFORCE(x->dims()[0] == c_prev->dims()[0],
+                   "Batch size of inputs and states must be equal");
+    // X must be four times as wide as the state: the kernels read four
+    // consecutive blocks of state-dim values from every row of X.
+    PADDLE_ENFORCE(x->dims()[1] == c_prev->dims()[1] * 4,
+                   "Dimension of FC should equal to prev state * 4");
+    int b_size = c_prev->dims()[0];  // batch size
+    int s_dim = c_prev->dims()[1];   // state dim
+    ctx.Output<framework::LoDTensor>("C")->Resize({b_size, s_dim});
+    ctx.Output<framework::LoDTensor>("H")->Resize({b_size, s_dim});
+  }
+};
+// Declares the inputs, outputs, attribute and user-facing documentation of
+// the lstm_unit operator. AttrType is the C++ type of the forget_bias
+// attribute.
+template <typename AttrType>
+class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LstmUnitOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "FC input before the non-linear activation.");
+    AddInput(
+        "C_prev",
+        "The cell state tensor of last time-step in the Lstm Unit operator.");
+    AddOutput("C", "The cell tensor of Lstm Unit operator.");
+    AddOutput("H", "The hidden state tensor of Lstm Unit operator.");
+    // Keep these equations in sync with the kernels: both the CPU and the
+    // CUDA kernel compute H as the output gate times tanh(C), not times C.
+    AddComment(R"DOC(Lstm-Unit Operator
+Equation:
+  i, f, o, j = split(X)
+  C = C_prev * sigm(f + forget_bias) + sigm(i) * tanh(j)
+  H = tanh(C) * sigm(o)
+)DOC");
+    // Constant added to the forget-gate pre-activation before the sigmoid.
+    AddAttr<AttrType>("forget_bias", "The forget bias of Lstm Unit.")
+        .SetDefault(0.0);
+  }
+};
+// Backward operator definition of the LSTM unit. It only performs shape
+// inference for the gradients of X and C_prev.
+class LstmUnitGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  // Requires the gradient variables of C and H to be bound, then gives each
+  // output gradient the same dims as the forward input it belongs to.
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("C")),
+                            "Input(C@GRAD) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("H")),
+                            "Input(H@GRAD) should not be null");
+    // NOTE(review): X and C_prev are dereferenced here without a preceding
+    // null check, unlike the forward operator.
+    ctx.Output<framework::LoDTensor>(framework::GradVarName("X"))
+        ->Resize(ctx.Input<Tensor>("X")->dims());
+    ctx.Output<framework::LoDTensor>(framework::GradVarName("C_prev"))
+        ->Resize(ctx.Input<Tensor>("C_prev")->dims());
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the lstm_unit operator together with its gradient operator
+// lstm_unit_grad; the forget_bias attribute is declared as float.
+REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker<float>,
+            lstm_unit_grad, ops::LstmUnitGradOp);
+// CPU kernels are registered for float only.
+REGISTER_OP_CPU_KERNEL(lstm_unit,
+                       ops::LstmUnitKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    lstm_unit_grad, ops::LstmUnitGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/lstm_unit_op.cu
+++ b/paddle/operators/lstm_unit_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/cross_entropy_op.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/hostdevice.h"
+namespace paddle {
+namespace operators {
+// Grid-stride loop over [0, n): each thread starts at its global 1-D index
+// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number
+// of launched threads (blockDim.x * gridDim.x), so any grid size covers n.
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+// Device-side logistic function: 1 / (1 + exp(-x)).
+template <typename Dtype>
+__device__ Dtype cuda_sigmoid(const Dtype x) {
+  return Dtype(1) / (Dtype(1) + exp(-x));
+}
+// Device-side hyperbolic tangent, computed as 2 * sigmoid(2x) - 1.
+//
+// This is the same formulation the CPU kernel uses, so both devices agree.
+// It also saturates cleanly: for strongly negative x, exp(-2x) overflows to
+// +inf, the sigmoid becomes 0 and the result is exactly -1. The previous
+// (1 - exp(-2x)) / (1 + exp(-2x)) form produced -inf or NaN (inf / inf) in
+// that range, and its `2.` literals forced double-precision arithmetic even
+// when Dtype is float.
+template <typename Dtype>
+__device__ Dtype cuda_tanh(const Dtype x) {
+  return Dtype(2) * cuda_sigmoid(Dtype(2) * x) - Dtype(1);
+}
+// CUDA forward kernel of the LSTM unit.
+//
+// nthreads    - total number of state elements to process (callers pass
+//               batch size * dim).
+// dim         - number of state elements per sample.
+// C_prev      - previous cell state, indexed by the flat element index.
+// X           - gate pre-activations; every sample owns 4 * dim values read
+//               as four consecutive blocks of dim: input gate, forget gate,
+//               output gate, candidate.
+// C, H        - outputs: new cell state and hidden state, flat-indexed.
+// forget_bias - added to the forget-gate pre-activation before the sigmoid.
+template <typename T>
+__global__ void LSTMUnitKernel(const int nthreads, const int dim,
+                               const T* C_prev, const T* X, T* C, T* H,
+                               const T forget_bias) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // Split the flat index into sample (n) and position within the state (d).
+    const int n = index / dim;
+    const int d = index % dim;
+    // Start of this sample's 4 * dim block of gate pre-activations.
+    const T* X_offset = X + 4 * dim * n;
+    const T i = cuda_sigmoid(X_offset[d]);
+    const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias);
+    const T o = cuda_sigmoid(X_offset[2 * dim + d]);
+    const T g = cuda_tanh(X_offset[3 * dim + d]);
+    const T c_prev = C_prev[index];
+    // New cell state: forget part of the old state, add the gated candidate.
+    const T c = f * c_prev + i * g;
+    C[index] = c;
+    const T tanh_c = cuda_tanh(c);
+    // Hidden state: output gate applied to the squashed cell state.
+    H[index] = o * tanh_c;
+  }
+}
+// CUDA backward kernel of the LSTM unit.
+//
+// Recomputes the gate activations from X, then writes the gradient of the
+// previous cell state (C_prev_diff) and of the four gate pre-activations
+// (X_diff, same 4 * dim per-sample layout as X) from the incoming gradients
+// C_diff and H_diff. The H argument is accepted but never read here.
+template <typename T>
+__global__ void LSTMUnitGradientKernel(const int nthreads, const int dim,
+                                       const T* C_prev, const T* X, const T* C,
+                                       const T* H, const T* C_diff,
+                                       const T* H_diff, T* C_prev_diff,
+                                       T* X_diff, const T forget_bias) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // Split the flat index into sample (n) and position within the state (d).
+    const int n = index / dim;
+    const int d = index % dim;
+    const T* X_offset = X + 4 * dim * n;
+    T* c_prev_diff = C_prev_diff + index;
+    // Destinations for the four gate gradients of this element.
+    T* X_diff_offset = X_diff + 4 * dim * n;
+    T* i_diff = X_diff_offset + d;
+    T* f_diff = X_diff_offset + 1 * dim + d;
+    T* o_diff = X_diff_offset + 2 * dim + d;
+    T* g_diff = X_diff_offset + 3 * dim + d;
+    // Recompute the forward activations instead of storing them.
+    const T i = cuda_sigmoid(X_offset[d]);
+    const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias);
+    const T o = cuda_sigmoid(X_offset[2 * dim + d]);
+    const T g = cuda_tanh(X_offset[3 * dim + d]);
+    const T c_prev = C_prev[index];
+    const T c = C[index];
+    const T tanh_c = cuda_tanh(c);
+    // Total gradient reaching the cell state: the direct C gradient plus the
+    // H gradient propagated back through H = o * tanh(c).
+    const T c_term_diff =
+        C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c);
+    *c_prev_diff = c_term_diff * f;
+    // s * (1 - s) is the sigmoid derivative; (1 - g * g) is the tanh one.
+    *i_diff = c_term_diff * g * i * (1 - i);
+    *f_diff = c_term_diff * c_prev * f * (1 - f);
+    *o_diff = H_diff[index] * tanh_c * o * (1 - o);
+    *g_diff = c_term_diff * i * (1 - g * g);
+  }
+}
+// GPU OpKernel for lstm_unit: fetches the tensors from the execution context
+// and launches LSTMUnitKernel over all batch size * state dim elements.
+template <typename T, typename AttrType = T>
+class LstmUnitOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+    auto* x_tensor = ctx.Input<framework::Tensor>("X");
+    auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev");
+    auto* c_tensor = ctx.Output<framework::Tensor>("C");
+    auto* h_tensor = ctx.Output<framework::Tensor>("H");
+    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+    // Batch size and state dim are taken from the output tensor C.
+    int b_size = c_tensor->dims()[0];
+    int D = c_tensor->dims()[1];
+    const T* X = x_tensor->data<T>();
+    const T* C_prev = c_prev_tensor->data<T>();
+    T* C = c_tensor->mutable_data<T>(ctx.GetPlace());
+    T* H = h_tensor->mutable_data<T>(ctx.GetPlace());
+    // One thread per state element, 512 threads per block, grid rounded up.
+    int block = 512;
+    int n = b_size * D;
+    int grid = (n + block - 1) / block;
+    // NOTE(review): the launch passes no stream argument, so it does not use
+    // the stream of the execution context's device context - confirm that
+    // this is intended.
+    LSTMUnitKernel<T><<<grid, block>>>(n, D, C_prev, X, C, H, forget_bias);
+  }
+};
+// GPU OpKernel for lstm_unit_grad: fetches the forward tensors and incoming
+// gradients from the execution context and launches LSTMUnitGradientKernel
+// over all batch size * state dim elements.
+template <typename T, typename AttrType = T>
+class LstmUnitGradOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+    // Forward inputs and outputs.
+    auto x_tensor = ctx.Input<Tensor>("X");
+    auto c_prev_tensor = ctx.Input<Tensor>("C_prev");
+    auto c_tensor = ctx.Input<Tensor>("C");
+    auto h_tensor = ctx.Input<Tensor>("H");
+    // Incoming gradients of the forward outputs.
+    auto hdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("H"));
+    auto cdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("C"));
+    // Gradients produced by this kernel.
+    auto xdiff_tensor = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto c_prev_diff_tensor =
+        ctx.Output<Tensor>(framework::GradVarName("C_prev"));
+    auto* X = x_tensor->data<T>();
+    auto* C_prev = c_prev_tensor->data<T>();
+    auto* C = c_tensor->data<T>();
+    auto* H = h_tensor->data<T>();
+    auto* H_diff = hdiff_tensor->data<T>();
+    auto* C_diff = cdiff_tensor->data<T>();
+    auto* C_prev_diff = c_prev_diff_tensor->mutable_data<T>(ctx.GetPlace());
+    auto* X_diff = xdiff_tensor->mutable_data<T>(ctx.GetPlace());
+    // Batch size (N) and state dim (D) are taken from the forward output C.
+    int N = c_tensor->dims()[0];
+    int D = c_tensor->dims()[1];
+    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+    // One thread per state element, 512 threads per block, grid rounded up.
+    int block = 512;
+    int n = N * D;
+    int grid = (n + block - 1) / block;
+    // NOTE(review): the launch passes no stream argument, so it does not use
+    // the stream of the execution context's device context - confirm that
+    // this is intended.
+    LSTMUnitGradientKernel<T><<<grid, block>>>(n, D, C_prev, X, C, H, C_diff,
+                                               H_diff, C_prev_diff, X_diff,
+                                               forget_bias);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// GPU kernels for the forward and backward operators, float only.
+REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel<float>);
--- a/paddle/operators/lstm_unit_op.h
+++ b/paddle/operators/lstm_unit_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "glog/logging.h"
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+using framework::LoDTensor;
+using framework::Tensor;
+// Logistic function 1 / (1 + e^(-x)). The double literals drive the
+// arithmetic; the quotient is converted back to T on return.
+template <typename T>
+inline T sigmoid(T x) {
+  const auto denominator = 1. + exp(-x);
+  return 1. / denominator;
+}
+// Hyperbolic tangent expressed through the logistic function:
+// tanh(x) = 2 * sigmoid(2x) - 1.
+template <typename T>
+inline T tanh(T x) {
+  const auto scaled_sigmoid = 2. * sigmoid(2. * x);
+  return scaled_sigmoid - 1.;
+}
+// CPU forward kernel of the LSTM unit.
+//
+// For every sample, X holds four consecutive blocks of state-dim values:
+// the pre-activations of the input gate, forget gate, output gate and
+// candidate. The kernel writes
+//   C = sigm(f + forget_bias) * C_prev + sigm(i) * tanh(g)
+//   H = sigm(o) * tanh(C)
+template <typename Place, typename T, typename AttrType = T>
+class LstmUnitKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    auto* gates_tensor = ctx.Input<framework::Tensor>("X");
+    auto* prev_cell_tensor = ctx.Input<framework::Tensor>("C_prev");
+    auto* cell_tensor = ctx.Output<framework::Tensor>("C");
+    auto* hidden_tensor = ctx.Output<framework::Tensor>("H");
+    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+    // Batch size and state dim are taken from the output tensor C.
+    int batch = cell_tensor->dims()[0];
+    int state_dim = cell_tensor->dims()[1];
+    T* cell_row = cell_tensor->mutable_data<T>(ctx.GetPlace());
+    T* hidden_row = hidden_tensor->mutable_data<T>(ctx.GetPlace());
+    const T* gates_row = gates_tensor->data<T>();
+    const T* prev_cell_row = prev_cell_tensor->data<T>();
+    // Walk the batch one sample at a time; every row pointer moves on to the
+    // next sample at the end of each iteration.
+    for (int row = 0; row < batch; ++row, prev_cell_row += state_dim,
+             gates_row += 4 * state_dim, cell_row += state_dim,
+             hidden_row += state_dim) {
+      // The four gate blocks of the current sample.
+      const T* in_gate = gates_row;
+      const T* forget_gate = gates_row + 1 * state_dim;
+      const T* out_gate = gates_row + 2 * state_dim;
+      const T* candidate = gates_row + 3 * state_dim;
+      for (int col = 0; col < state_dim; ++col) {
+        const T i = sigmoid(in_gate[col]);
+        const T f = sigmoid(forget_gate[col] + forget_bias);
+        const T o = sigmoid(out_gate[col]);
+        const T g = tanh(candidate[col]);
+        const T c = f * prev_cell_row[col] + i * g;
+        cell_row[col] = c;
+        hidden_row[col] = o * tanh(c);
+      }
+    }
+  }
+};
+// CPU backward kernel of the LSTM unit.
+//
+// Recomputes the gate activations from X, then writes the gradient of the
+// previous cell state and of the four gate pre-activations (same per-sample
+// layout as X: four consecutive blocks of D values) from the incoming
+// gradients of C and H.
+template <typename Place, typename T, typename AttrType = T>
+class LstmUnitGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    // Forward inputs and outputs.
+    auto x_tensor = ctx.Input<Tensor>("X");
+    auto c_prev_tensor = ctx.Input<Tensor>("C_prev");
+    auto c_tensor = ctx.Input<Tensor>("C");
+    auto h_tensor = ctx.Input<Tensor>("H");
+    // Incoming gradients of the forward outputs.
+    auto hdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("H"));
+    auto cdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("C"));
+    // Gradients produced by this kernel.
+    auto xdiff_tensor = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto c_prev_diff_tensor =
+        ctx.Output<Tensor>(framework::GradVarName("C_prev"));
+    auto* X = x_tensor->data<T>();
+    auto* C_prev = c_prev_tensor->data<T>();
+    auto* C = c_tensor->data<T>();
+    // H is fetched and advanced below but its values are never read.
+    auto* H = h_tensor->data<T>();
+    auto* H_diff = hdiff_tensor->data<T>();
+    auto* C_diff = cdiff_tensor->data<T>();
+    auto* C_prev_diff = c_prev_diff_tensor->mutable_data<T>(ctx.GetPlace());
+    auto* X_diff = xdiff_tensor->mutable_data<T>(ctx.GetPlace());
+    // Batch size (N) and state dim (D) are taken from the forward output C.
+    int N = c_tensor->dims()[0];
+    int D = c_tensor->dims()[1];
+    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+    for (int n = 0; n < N; ++n) {
+      for (int d = 0; d < D; ++d) {
+        // Destinations for this element's gradients.
+        T* c_prev_diff = C_prev_diff + d;
+        T* i_diff = X_diff + d;
+        T* f_diff = X_diff + 1 * D + d;
+        T* o_diff = X_diff + 2 * D + d;
+        T* g_diff = X_diff + 3 * D + d;
+        // Recompute the forward activations instead of storing them.
+        const T i = sigmoid(X[d]);
+        const T f = sigmoid(X[1 * D + d] + forget_bias);
+        const T o = sigmoid(X[2 * D + d]);
+        const T g = tanh(X[3 * D + d]);
+        const T c_prev = C_prev[d];
+        const T c = C[d];
+        const T tanh_c = tanh(c);
+        // Total gradient reaching the cell state: the direct C gradient plus
+        // the H gradient propagated back through H = o * tanh(c).
+        const T c_term_diff = C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c);
+        *c_prev_diff = c_term_diff * f;
+        // s * (1 - s) is the sigmoid derivative; (1 - g * g) is the tanh one.
+        *i_diff = c_term_diff * g * i * (1 - i);
+        *f_diff = c_term_diff * c_prev * f * (1 - f);
+        *o_diff = H_diff[d] * tanh_c * o * (1 - o);
+        *g_diff = c_term_diff * i * (1 - g * g);
+      }
+      // Advance every pointer to the next sample; X and X_diff hold four
+      // blocks of D values per sample, the others hold one.
+      C_prev += D;
+      X += 4 * D;
+      C += D;
+      H += D;
+      C_diff += D;
+      H_diff += D;
+      X_diff += 4 * D;
+      C_prev_diff += D;
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/im2col.cc
+++ b/paddle/operators/math/im2col.cc
@@ -27,9 +27,10 @@ template <class T>
 class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                    platform::CPUPlace, T> {
 public:
-  void operator()(const framework::Tensor& im, framework::Tensor& col,
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& im, framework::Tensor& col,
                  int stride_height, int stride_width, int padding_height,
-                  int padding_width, platform::DeviceContext* context) {
+                  int padding_width) {
    PADDLE_ENFORCE(im.dims().size() == 3);
    PADDLE_ENFORCE(col.dims().size() == 5);
@@ -79,9 +80,9 @@ template <class T>
 class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                    platform::CPUPlace, T> {
 public:
-  void operator()(framework::Tensor& im, const framework::Tensor& col,
+  void operator()(const platform::DeviceContext& context, framework::Tensor& im,
-                  int stride_height, int stride_width, int padding_height,
+                  const framework::Tensor& col, int stride_height,
-                  int padding_width, platform::DeviceContext* context) {
+                  int stride_width, int padding_height, int padding_width) {
    PADDLE_ENFORCE(im.dims().size() == 3);
    PADDLE_ENFORCE(col.dims().size() == 5);
    int input_channels = im.dims()[0];
@@ -137,9 +138,10 @@ template <class T>
 class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                    platform::CPUPlace, T> {
 public:
-  void operator()(const framework::Tensor& im, framework::Tensor& col,
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& im, framework::Tensor& col,
                  int stride_height, int stride_width, int padding_height,
-                  int padding_width, platform::DeviceContext* context) {
+                  int padding_width) {
    PADDLE_ENFORCE(im.dims().size() == 3);
    PADDLE_ENFORCE(col.dims().size() == 5);
    int input_channels = im.dims()[0];
@@ -197,9 +199,9 @@ template <class T>
 class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                    platform::CPUPlace, T> {
 public:
-  void operator()(framework::Tensor& im, const framework::Tensor& col,
+  void operator()(const platform::DeviceContext& context, framework::Tensor& im,
-                  int stride_height, int stride_width, int padding_height,
+                  const framework::Tensor& col, int stride_height,
-                  int padding_width, platform::DeviceContext* context) {
+                  int stride_width, int padding_height, int padding_width) {
    PADDLE_ENFORCE(im.dims().size() == 3);
    PADDLE_ENFORCE(col.dims().size() == 5);
    int input_channels = im.dims()[0];

--- a/paddle/operators/math/im2col.cu
+++ b/paddle/operators/math/im2col.cu
@@ -64,9 +64,10 @@ template <class T>
 class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                    platform::GPUPlace, T> {
 public:
-  void operator()(const framework::Tensor& im, framework::Tensor& col,
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& im, framework::Tensor& col,
                  int stride_height, int stride_width, int padding_height,
-                  int padding_width, platform::DeviceContext* context) {
+                  int padding_width) {
    PADDLE_ENFORCE(im.dims().size() == 3);
    PADDLE_ENFORCE(col.dims().size() == 5);
@@ -84,9 +85,9 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
    int block_y = (blocks + 512 - 1) / 512;
    dim3 threads(1024, 1);
    dim3 grid(block_x, block_y);
-    im2col<T><<<
+    im2col<T><<<grid, threads, 0,
-        grid, threads, 0,
+                reinterpret_cast<const platform::CUDADeviceContext&>(context)
-        reinterpret_cast<platform::CUDADeviceContext*>(context)->stream()>>>(
+                    .stream()>>>(
        im.data<T>(), num_outputs, input_height, input_width, filter_height,
        filter_width, stride_height, stride_width, padding_height,
        padding_width, output_height, output_width, col.data<T>());
@@ -149,9 +150,9 @@ template <class T>
 class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                    platform::GPUPlace, T> {
 public:
-  void operator()(framework::Tensor& im, const framework::Tensor& col,
+  void operator()(const platform::DeviceContext& context, framework::Tensor& im,
-                  int stride_height, int stride_width, int padding_height,
+                  const framework::Tensor& col, int stride_height,
-                  int padding_width, platform::DeviceContext* context) {
+                  int stride_width, int padding_height, int padding_width) {
    PADDLE_ENFORCE(im.dims().size() == 3);
    PADDLE_ENFORCE(col.dims().size() == 5);
@@ -174,9 +175,9 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
    // To avoid involving atomic operations, we will launch one kernel per
    // bottom dimension, and then in the kernel add up the top dimensions.
-    col2im<T><<<
+    col2im<T><<<grid, threads, 0,
-        grid, threads, 0,
+                reinterpret_cast<const platform::CUDADeviceContext&>(context)
-        reinterpret_cast<platform::CUDADeviceContext*>(context)->stream()>>>(
+                    .stream()>>>(
        num_kernels, col.data<T>(), input_height + 2 * padding_height,
        input_width + 2 * padding_width, input_channels, filter_height,
        filter_width, stride_height, stride_width, padding_height,
@@ -235,9 +236,10 @@ template <class T>
 class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                    platform::GPUPlace, T> {
 public:
-  void operator()(const framework::Tensor& im, framework::Tensor& col,
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& im, framework::Tensor& col,
                  int stride_height, int stride_width, int padding_height,
-                  int padding_width, platform::DeviceContext* context) {
+                  int padding_width) {
    PADDLE_ENFORCE(im.dims().size() == 3);
    PADDLE_ENFORCE(col.dims().size() == 5);
    int input_channels = im.dims()[0];
@@ -268,9 +270,9 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
    dim3 threads(block_dim_x, block_dim_y,
                 std::min(block_dim_z, input_channels));
    dim3 grid(output_width, output_height);
-    im2colOCF<T><<<
+    im2colOCF<T><<<grid, threads, 0,
-        grid, threads, 0,
+                   reinterpret_cast<const platform::CUDADeviceContext&>(context)
-        reinterpret_cast<platform::CUDADeviceContext*>(context)->stream()>>>(
+                       .stream()>>>(
        im.data<T>(), col.data<T>(), input_channels, input_height, input_width,
        filter_height, filter_width, stride_height, stride_width,
        padding_height, padding_width, output_height, output_width);
@@ -318,9 +320,9 @@ template <class T>
 class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                    platform::GPUPlace, T> {
 public:
-  void operator()(framework::Tensor& im, const framework::Tensor& col,
+  void operator()(const platform::DeviceContext& context, framework::Tensor& im,
-                  int stride_height, int stride_width, int padding_height,
+                  const framework::Tensor& col, int stride_height,
-                  int padding_width, platform::DeviceContext* context) {
+                  int stride_width, int padding_height, int padding_width) {
    PADDLE_ENFORCE(im.dims().size() == 3);
    PADDLE_ENFORCE(col.dims().size() == 5);
    int input_channels = im.dims()[0];
@@ -351,9 +353,9 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
    dim3 threads(block_dim_x, block_dim_y,
                 std::min(block_dim_z, input_channels));
    dim3 grid(output_width, output_height);
-    col2imOCF<T><<<
+    col2imOCF<T><<<grid, threads, 0,
-        grid, threads, 0,
+                   reinterpret_cast<const platform::CUDADeviceContext&>(context)
-        reinterpret_cast<platform::CUDADeviceContext*>(context)->stream()>>>(
+                       .stream()>>>(
        im.data<T>(), col.data<T>(), input_channels, input_height, input_width,
        filter_height, filter_width, stride_height, stride_width,
        padding_height, padding_width, output_height, output_width);

--- a/paddle/operators/math/im2col.h
+++ b/paddle/operators/math/im2col.h
@@ -72,17 +72,18 @@ enum class ColFormat { kCFO = 0, kOCF = 1 };
 template <ColFormat Format, typename Place, typename T>
 class Im2ColFunctor {
 public:
-  void operator()(const framework::Tensor& im, framework::Tensor& col,
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& im, framework::Tensor& col,
                  int stride_height, int stride_width, int padding_height,
-                  int padding_width, platform::DeviceContext* context);
+                  int padding_width);
 };
 template <ColFormat Format, typename Place, typename T>
 class Col2ImFunctor {
 public:
-  void operator()(framework::Tensor& im, const framework::Tensor& col,
+  void operator()(const platform::DeviceContext& context, framework::Tensor& im,
-                  int stride_height, int stride_width, int padding_height,
+                  const framework::Tensor& col, int stride_height,
-                  int padding_width, platform::DeviceContext* context);
+                  int stride_width, int padding_height, int padding_width);
 };
 }  // namespace math

--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
@@ -78,8 +78,8 @@ void testIm2col() {
    PADDLE_THROW("no GPU support");
 #endif  // PADDLE_ONLY_CPU
  }
-  im2col(input, output_cfo, stride, stride, padding, padding, context);
+  im2col(*context, input, output_cfo, stride, stride, padding, padding);
-  im2col_ocf(input, output_ocf, stride, stride, padding, padding, context);
+  im2col_ocf(*context, input, output_ocf, stride, stride, padding, padding);
  float* out_cfo_ptr;
  if (paddle::platform::is_cpu_place(*place)) {

--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -48,6 +48,32 @@ void gemm<platform::CPUPlace, double>(const platform::DeviceContext& context,
              beta, C, ldc);
 }
+template <>
+void gemm<platform::CPUPlace, float>(const platform::DeviceContext& context,
+                                     const bool transA, const bool transB,
+                                     const int M, const int N, const int K,
+                                     const float alpha, const float* A,
+                                     const int lda, const float* B,
+                                     const int ldb, const float beta, float* C,
+                                     const int ldc) {
+  cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
+              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
+              lda, B, ldb, beta, C, ldc);
+}
+template <>
+void gemm<platform::CPUPlace, double>(const platform::DeviceContext& context,
+                                      const bool transA, const bool transB,
+                                      const int M, const int N, const int K,
+                                      const double alpha, const double* A,
+                                      const int lda, const double* B,
+                                      const int ldb, const double beta,
+                                      double* C, const int ldc) {
+  cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
+              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
+              lda, B, ldb, beta, C, ldc);
+}
 template <>
 void matmul<platform::CPUPlace, float>(
    const platform::DeviceContext& context, const framework::Tensor& matrix_a,

--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -63,6 +63,42 @@ void gemm<platform::GPUPlace, double>(const platform::DeviceContext& context,
      cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
 }
+// Single-precision GPU GEMM with explicit leading dimensions, implemented
+// with cublasSgemm. `context` is reinterpreted as a CUDADeviceContext to
+// obtain the cuBLAS handle.
+template <>
+void gemm<platform::GPUPlace, float>(const platform::DeviceContext& context,
+                                     const bool transA, const bool transB,
+                                     const int M, const int N, const int K,
+                                     const float alpha, const float* A,
+                                     const int lda, const float* B,
+                                     const int ldb, const float beta, float* C,
+                                     const int ldc) {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention: the B operand (with its flag, N and ldb) is passed
+  // first, followed by the A operand (with its flag, M and lda).
+  const cublasOperation_t op_a = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
+  const cublasOperation_t op_b = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
+  const auto& cuda_ctx =
+      reinterpret_cast<const platform::CUDADeviceContext&>(context);
+  PADDLE_ENFORCE(platform::dynload::cublasSgemm(
+      cuda_ctx.cublas_handle(), op_b, op_a, N, M, K, &alpha, B, ldb, A, lda,
+      &beta, C, ldc));
+}
+// Double-precision GPU GEMM with explicit leading dimensions, implemented
+// with cublasDgemm. `context` is reinterpreted as a CUDADeviceContext to
+// obtain the cuBLAS handle.
+template <>
+void gemm<platform::GPUPlace, double>(const platform::DeviceContext& context,
+                                      const bool transA, const bool transB,
+                                      const int M, const int N, const int K,
+                                      const double alpha, const double* A,
+                                      const int lda, const double* B,
+                                      const int ldb, const double beta,
+                                      double* C, const int ldc) {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention: the B operand (with its flag, N and ldb) is passed
+  // first, followed by the A operand (with its flag, M and lda).
+  const cublasOperation_t op_a = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
+  const cublasOperation_t op_b = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
+  const auto& cuda_ctx =
+      reinterpret_cast<const platform::CUDADeviceContext&>(context);
+  PADDLE_ENFORCE(platform::dynload::cublasDgemm(
+      cuda_ctx.cublas_handle(), op_b, op_a, N, M, K, &alpha, B, ldb, A, lda,
+      &beta, C, ldc));
+}
 template <>
 void matmul<platform::GPUPlace, float>(
    const platform::DeviceContext& context, const framework::Tensor& matrix_a,

--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -70,6 +70,13 @@ void gemm(const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
          const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
          const T alpha, const T* A, const T* B, const T beta, T* C);
+// gemm wrapper that takes explicit leading dimensions (lda, ldb, ldc) so it
+// can operate on matrices that are not contiguous in memory, e.g. a column
+// slice of a larger matrix. Transposition is selected with plain booleans
+// (transA / transB) instead of CBLAS_TRANSPOSE values.
+template <typename Place, typename T>
+void gemm(const platform::DeviceContext& context, const bool transA,
+          const bool transB, const int M, const int N, const int K,
+          const T alpha, const T* A, const int lda, const T* B, const int ldb,
+          const T beta, T* C, const int ldc);
 // matrix multiply with continuous memory
 template <typename Place, typename T>
 void matmul(const platform::DeviceContext& context,

--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
@@ -72,4 +72,174 @@ TEST(math_function, trans_mul_notrans) {
  EXPECT_EQ(out_ptr[8], 29);
  delete gpu_place;
 }
+// Checks the strided GPU gemm without transposition: multiplies a 2x3 matrix
+// by the last three columns of a 3x4 matrix and accumulates into the last
+// three columns of a 2x4 output, leaving the first output column untouched.
+TEST(math_function, gemm_notrans_cublas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  // Build the operands on the host first.
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({3, 4}, *cpu_place);
+  float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+  // Copy the operands to the device.
+  auto* gpu_place = new paddle::platform::GPUPlace(0);
+  paddle::platform::CUDADeviceContext context(*gpu_place);
+  input1_gpu.CopyFrom<float>(input1, *gpu_place);
+  input2_gpu.CopyFrom<float>(input2, *gpu_place);
+  input3_gpu.CopyFrom<float>(input3, *gpu_place);
+  float* a = input1_gpu.data<float>();
+  float* b = input2_gpu.data<float>();
+  float* c = input3_gpu.mutable_data<float>(*gpu_place);
+  // b + 1 / c + 1 with a leading dimension of 4 select columns 1..3 of the
+  // 4-column matrices.
+  paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
+      context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
+  input3.CopyFrom<float>(input3_gpu, *cpu_place);
+  // numpy code:
+  // a = np.arange(6).reshape(2, 3)
+  // b = np.arange(12).reshape(3, 4)[:, 1:]
+  // c = np.arange(8).reshape(2, 4)[:, 1:]
+  // out = np.arange(8).reshape(2, 4)
+  // out[:, 1:] = np.dot(a, b) + c
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+  delete gpu_place;
+  // Release the host place as well; it was previously leaked.
+  delete cpu_place;
+}
+// Checks the strided GPU gemm with B transposed: B is stored as a 4x3 matrix
+// whose last three rows, once transposed, form the 3x3 right-hand operand.
+// The expected output matches the non-transposed test.
+TEST(math_function, gemm_trans_cublas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  // Build the operands on the host first.
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({4, 3}, *cpu_place);
+  float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+  // Copy the operands to the device.
+  auto* gpu_place = new paddle::platform::GPUPlace(0);
+  paddle::platform::CUDADeviceContext context(*gpu_place);
+  input1_gpu.CopyFrom<float>(input1, *gpu_place);
+  input2_gpu.CopyFrom<float>(input2, *gpu_place);
+  input3_gpu.CopyFrom<float>(input3, *gpu_place);
+  float* a = input1_gpu.data<float>();
+  float* b = input2_gpu.data<float>();
+  float* c = input3_gpu.mutable_data<float>(*gpu_place);
+  // b + 3 skips the first row of the 4x3 matrix; c + 1 with a leading
+  // dimension of 4 selects columns 1..3 of the output.
+  paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
+      context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
+  input3.CopyFrom<float>(input3_gpu, *cpu_place);
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+  delete gpu_place;
+  // Release the host place as well; it was previously leaked.
+  delete cpu_place;
+}
 #endif
+// Checks the strided CPU gemm without transposition: multiplies a 2x3 matrix
+// by the last three columns of a 3x4 matrix and accumulates into the last
+// three columns of a 2x4 output, leaving the first output column untouched.
+TEST(math_function, gemm_notrans_cblas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({3, 4}, *cpu_place);
+  float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+  paddle::platform::CPUDeviceContext context(*cpu_place);
+  // input2_ptr + 1 / input3_ptr + 1 with a leading dimension of 4 select
+  // columns 1..3 of the 4-column matrices.
+  paddle::operators::math::gemm<paddle::platform::CPUPlace, float>(
+      context, false, false, m, n, k, 1, input1_ptr, 3, input2_ptr + 1, 4, 1,
+      input3_ptr + 1, 4);
+  // numpy code:
+  // a = np.arange(6).reshape(2, 3)
+  // b = np.arange(12).reshape(3, 4)[:, 1:]
+  // c = np.arange(8).reshape(2, 4)[:, 1:]
+  // out = np.arange(8).reshape(2, 4)
+  // out[:, 1:] = np.dot(a, b) + c
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+  // Release the heap-allocated place; it was previously leaked.
+  delete cpu_place;
+}
+// Checks the strided CPU gemm with B transposed: B is stored as a 4x3 matrix
+// whose last three rows, once transposed, form the 3x3 right-hand operand.
+// The expected output matches the non-transposed test.
+// (Test name corrected from the misspelled "gemm_trans_clbas".)
+TEST(math_function, gemm_trans_cblas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({4, 3}, *cpu_place);
+  float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+  paddle::platform::CPUDeviceContext context(*cpu_place);
+  // input2_ptr + 3 skips the first row of the 4x3 matrix; input3_ptr + 1
+  // with a leading dimension of 4 selects columns 1..3 of the output.
+  paddle::operators::math::gemm<paddle::platform::CPUPlace, float>(
+      context, false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1,
+      input3_ptr + 1, 4);
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+  // Release the heap-allocated place; it was previously leaked.
+  delete cpu_place;
+}
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -27,7 +27,7 @@ class MeanOp : public framework::OperatorWithKernel {
                            "Input(X) of MeanOp should not be null.");
    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
                            "Output(Out) of MeanOp should not be null.");
-    ctx.Output<framework::LoDTensor>("Out")->Resize({1});
+    ctx.Output<framework::Tensor>("Out")->Resize({1});
  }
 };
@@ -37,7 +37,8 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of mean op");
    AddOutput("Out", "The output of mean op").NotInGradient();
-    AddComment("Mean Operator");
+    AddComment(R"DOC( Mean Operator
+)DOC");
  }
 };
@@ -47,7 +48,7 @@ class MeanGradOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    ctx.Output<framework::LoDTensor>(framework::GradVarName("X"))
+    ctx.Output<framework::Tensor>(framework::GradVarName("X"))
        ->Resize(ctx.Input<Tensor>("X")->dims());
  }
 };

--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -40,7 +40,8 @@ class MinusOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(
        left_tensor->numel(), right_tensor->numel(),
        "Minus operator must take two tensor with same num of elements");
-    ctx.Output<framework::LoDTensor>("Out")->Resize(left_tensor->dims());
+    ctx.Output<framework::Tensor>("Out")->Resize(left_tensor->dims());
+    ctx.ShareLoD("X", /*->*/ "Out");
  }
 };
@@ -54,7 +55,12 @@ class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(Minus Operator
-Equation: Out = X - Y
+Equation:
+    Out = X - Y
+Both the input `X` and `Y` can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD with input `X`.
 )DOC");
  }
 };

--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
@@ -34,8 +34,8 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(x->dims().size(), 2, "The tensor rank of X must be 2.");
    PADDLE_ENFORCE_EQ(x->dims()[1], 1, "The 2nd dimension of X must be 1.");
-    context.Output<framework::LoDTensor>("IntermediateVal")->Resize(x->dims());
+    context.Output<framework::Tensor>("IntermediateVal")->Resize(x->dims());
-    context.Output<framework::LoDTensor>("Out")->Resize({x->dims()[0], 1});
+    context.Output<framework::Tensor>("Out")->Resize({x->dims()[0], 1});
  }
 };
@@ -81,7 +81,7 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
    auto* intermediate_val = context.Input<Tensor>("IntermediateVal");
    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
    auto* x_grad =
-        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
    PADDLE_ENFORCE_NOT_NULL(x, "X must be initialized.");
    PADDLE_ENFORCE_NOT_NULL(y, "Y must be initialized.");

--- a/paddle/operators/modified_huber_loss_op.h
+++ b/paddle/operators/modified_huber_loss_op.h
@@ -52,8 +52,8 @@ class ModifiedHuberLossKernel : public framework::OpKernel {
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in0 = context.Input<Tensor>("X");
    auto* in1 = context.Input<Tensor>("Y");
-    auto* out0 = context.Output<framework::LoDTensor>("IntermediateVal");
+    auto* out0 = context.Output<framework::Tensor>("IntermediateVal");
-    auto* out1 = context.Output<framework::LoDTensor>("Out");
+    auto* out1 = context.Output<framework::Tensor>("Out");
    out0->mutable_data<T>(context.GetPlace());
    out1->mutable_data<T>(context.GetPlace());
@@ -77,11 +77,9 @@ class ModifiedHuberLossGradCPUKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in0 = context.Input<Tensor>("Y");
-    auto* in1 = context.Input<framework::LoDTensor>("IntermediateVal");
+    auto* in1 = context.Input<framework::Tensor>("IntermediateVal");
-    auto* in2 =
+    auto* in2 = context.Input<framework::Tensor>(framework::GradVarName("Out"));
-        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* out0 =
-        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
    if (out0) {
      const T* y_ptr = in0->data<T>();

--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -18,7 +18,6 @@ namespace paddle {
 namespace operators {
 using framework::Tensor;
-using framework::LoDTensor;
 class MulOp : public framework::OperatorWithKernel {
 public:
@@ -53,8 +52,9 @@ class MulOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(
        x_mat_dims[1], y_mat_dims[0],
        "First matrix's width must be equal with second matrix's height.");
-    ctx.Output<framework::LoDTensor>("Out")->Resize(
+    ctx.Output<framework::Tensor>("Out")->Resize(
        {x_mat_dims[0], y_mat_dims[1]});
+    ctx.ShareLoD("X", /*->*/ "Out");
  }
 };
@@ -83,9 +83,14 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault(1)
        .EqualGreaterThan(1);
    AddComment(R"DOC(
-Two Element Mul Operator.
+Mul operator is used to perform matrix multiplication for input X and Y.
-The equation is: Out = X * Y
+The equation is:
+    Out = X * Y
+Both the input `X` and `Y` can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD with input `X`.
 )DOC");
  }
 };
@@ -103,10 +108,8 @@ class MulOpGrad : public framework::OperatorWithKernel {
    auto x_dims = ctx.Input<Tensor>("X")->dims();
    auto y_dims = ctx.Input<Tensor>("Y")->dims();
    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
-    auto *x_grad =
+    auto *x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto *y_grad = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
-    auto *y_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
    auto x_mat_dims =
        framework::flatten_to_2d(x_dims, Attr<int>("x_num_col_dims"));

--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@@ -39,8 +39,13 @@ class PadOp : public framework::OperatorWithKernel {
    for (int i = 0; i < x_dim.size(); ++i) {
      out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1];
    }
-    ctx.Output<framework::LoDTensor>("Out")->Resize(
+    ctx.Output<framework::Tensor>("Out")->Resize(
        framework::make_ddim(out_dims));
+    if (out_dims[0] == x_dim[0]) {
+      // Only pass LoD when the first dimension is equal between
+      // output and input.
+      ctx.ShareLoD("X", /*->*/ "Out");
+    }
  }
 };
@@ -101,7 +106,7 @@ class PadOpGrad : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
                            "Input(Out@GRAD) should not be null");
    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto *x_g = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto *x_g = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
    if (x_g != nullptr) {
      x_g->Resize(x_dims);
    }

--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
@@ -40,7 +40,7 @@ class RankLossOp : public framework::OperatorWithKernel {
                   "All inputs must have the same size");
    PADDLE_ENFORCE((label_dims.size() == 2) && (label_dims[1] == 1),
                   "All inputs must be row vector with size batch_size x 1.");
-    ctx.Output<framework::LoDTensor>("Out")->Resize(label_dims);
+    ctx.Output<framework::Tensor>("Out")->Resize(label_dims);
  }
 };
@@ -102,9 +102,9 @@ class RankLossGradOp : public framework::OperatorWithKernel {
                            "Input(Out@GRAD) shouldn't be null.");
    auto dims = ctx.Input<framework::Tensor>("Left")->dims();
    auto *left_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Left"));
+        ctx.Output<framework::Tensor>(framework::GradVarName("Left"));
    auto *right_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Right"));
+        ctx.Output<framework::Tensor>(framework::GradVarName("Right"));
    if (left_grad) {
      left_grad->Resize(dims);
    }

--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/operators/rank_loss_op.h
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ b/paddle/operators/rnn/recurrent_op_utils.h
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
--- a/paddle/operators/sequence_avg_pool_op.cc
+++ b/paddle/operators/sequence_avg_pool_op.cc
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
--- a/paddle/operators/sequence_avg_pool_op.h
+++ b/paddle/operators/sequence_avg_pool_op.h
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
--- a/paddle/operators/split_op.cc
+++ b/paddle/operators/split_op.cc
--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/operators/squared_l2_distance_op.cc
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
--- a/paddle/operators/top_k_op.cc
+++ b/paddle/operators/top_k_op.cc
--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
--- a/paddle/trainer/tests/sample_trainer_config_simple_net.conf
+++ b/paddle/trainer/tests/sample_trainer_config_simple_net.conf
--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/trainer/tests/test_CompareTwoNets.cpp
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/framework/op.py
--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/framework/tests/test_activation_op.py
--- a/python/paddle/v2/framework/tests/test_conv2d_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2d_op.py
--- a/python/paddle/v2/framework/tests/test_cos_sim_op.py
+++ b/python/paddle/v2/framework/tests/test_cos_sim_op.py
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
--- a/python/paddle/v2/framework/tests/test_dropout_op.py
+++ b/python/paddle/v2/framework/tests/test_dropout_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_add_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_div_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_div_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_sub_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_sub_op.py
--- a/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py
+++ b/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py
--- a/python/paddle/v2/framework/tests/test_lstm_unit_op.py
+++ b/python/paddle/v2/framework/tests/test_lstm_unit_op.py
--- a/python/paddle/v2/framework/tests/test_multiplex_op.py
+++ b/python/paddle/v2/framework/tests/test_multiplex_op.py
--- a/python/paddle/v2/framework/tests/test_prelu_op.py
+++ b/python/paddle/v2/framework/tests/test_prelu_op.py
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
--- a/python/paddle/v2/framework/tests/test_seq_pool.py
+++ b/python/paddle/v2/framework/tests/test_seq_pool.py
--- a/python/paddle/v2/framework/tests/test_sigmoid_op.py
+++ b/python/paddle/v2/framework/tests/test_sigmoid_op.py