diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py
index 1ac47212b5a75667e8e9d4465b33f575516e2836..4703944c8722552d56ba80a8e0663de5fb4df53d 100644
--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@@ -22,5 +22,5 @@ def initHook(settings, height, width, color, num_class, **kwargs):
 def process(settings, file_list):
     for i in xrange(1024):
         img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
-        lab = random.randint(0, settings.num_class)
+        lab = random.randint(0, settings.num_class - 1)
         yield img.astype('float32'), int(lab)
diff --git a/benchmark/paddle/image/run_mkldnn.sh b/benchmark/paddle/image/run_mkldnn.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b6cd6fe03b381d2b6529116f934ce7ce03d63546
--- /dev/null
+++ b/benchmark/paddle/image/run_mkldnn.sh
@@ -0,0 +1,48 @@
+set -e
+
+unset OMP_NUM_THREADS MKL_NUM_THREADS
+export OMP_DYNAMIC="FALSE"
+export KMP_AFFINITY="granularity=fine,compact,0,0"
+
+function train() {
+  topology=$1
+  bs=$2
+  use_mkldnn=$3
+  if [ $3 == "True" ]; then
+    thread=1
+    log="logs/${topology}-mkldnn-${bs}.log"
+  elif [ $3 == "False" ]; then
+    thread=`nproc`
+    log="logs/${topology}-${thread}mklml-${bs}.log"
+  else
+    echo "Wrong input $3, use True or False."
+  fi
+  args="batch_size=${bs}"
+  config="${topology}.py"
+  paddle train --job=time \
+    --config=$config \
+    --use_mkldnn=$use_mkldnn \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=10 \
+    --test_period=100 \
+    --config_args=$args \
+    2>&1 | tee ${log} 
+}
+
+if [ ! -d "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+#========== mkldnn ==========#
+train vgg 64 True
+train vgg 128 True
+train vgg 256 True
+
+#========== mklml ===========#
+train vgg 64 False
+train vgg 128 False
+train vgg 256 False
diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8429975f5c83df6996e71478fe276b246e8b77b
--- /dev/null
+++ b/benchmark/paddle/image/vgg.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+from paddle.trainer_config_helpers import *
+
+height = 224
+width = 224
+num_class = 1000
+batch_size = get_config_arg('batch_size', int, 64)
+layer_num = get_config_arg('layer_num', int, 19)
+
+args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+define_py_data_sources2(
+    "train.list", None, module="provider", obj="process", args=args)
+
+settings(
+    batch_size=batch_size,
+    learning_rate=0.01 / batch_size,
+    learning_method=MomentumOptimizer(0.9),
+    regularization=L2Regularization(0.0005 * batch_size))
+
+img = data_layer(name='image', size=height * width * 3)
+
+
+def vgg_network(vgg_num=3):
+    tmp = img_conv_group(
+        input=img,
+        num_channels=3,
+        conv_padding=1,
+        conv_num_filter=[64, 64],
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_size=2,
+        pool_stride=2,
+        pool_type=MaxPooling())
+
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=[128, 128],
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+
+    channels = []
+    for i in range(vgg_num):
+        channels.append(256)
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=channels,
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+    channels = []
+    for i in range(vgg_num):
+        channels.append(512)
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=channels,
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=channels,
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+
+    tmp = fc_layer(
+        input=tmp,
+        size=4096,
+        act=ReluActivation(),
+        layer_attr=ExtraAttr(drop_rate=0.5))
+
+    tmp = fc_layer(
+        input=tmp,
+        size=4096,
+        act=ReluActivation(),
+        layer_attr=ExtraAttr(drop_rate=0.5))
+
+    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
+
+
+if layer_num == 16:
+    vgg = vgg_network(3)
+elif layer_num == 19:
+    vgg = vgg_network(4)
+else:
+    print("Wrong layer number.")
+
+lab = data_layer('label', num_class)
+loss = cross_entropy(input=vgg, label=lab)
+outputs(loss)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index d2aab938d4636b1583062e27b73cb30f5d56b7b0..ff9868fc4e0d970b11e4763d2e0c8581f4f85907 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -106,22 +106,22 @@ function(merge_static_libs TARGET_NAME)
   endforeach()
   list(REMOVE_DUPLICATES libs_deps)
 
-  if(APPLE) # Use OSX's libtool to merge archives
-    # To produce a library we need at least one source file.
-    # It is created by add_custom_command below and will helps
-    # also help to track dependencies.
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+  # To produce a library we need at least one source file.
+  # It is created by add_custom_command below and will helps
+  # also help to track dependencies.
+  set(target_SRCS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
 
+  if(APPLE) # Use OSX's libtool to merge archives
     # Make the generated dummy source file depended on all static input
     # libs. If input lib changes,the source file is touched
     # which causes the desired effect (relink).
-    add_custom_command(OUTPUT ${dummyfile}
-      COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
+    add_custom_command(OUTPUT ${target_SRCS}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
       DEPENDS ${libs})
 
     # Generate dummy staic lib
-    file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-    add_library(${TARGET_NAME} STATIC ${dummyfile})
+    file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
     target_link_libraries(${TARGET_NAME} ${libs_deps})
 
     foreach(lib ${libs})
@@ -130,11 +130,14 @@ function(merge_static_libs TARGET_NAME)
     endforeach()
     add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
       COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
-      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
+      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
+      )
   else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+    set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)
+
     foreach(lib ${libs})
-      set(objlistfile ${lib}.objlist) # list of objects in the input library
-      set(objdir ${lib}.objdir)
+      set(objlistfile ${target_DIR}/${lib}.objlist) # list of objects in the input library
+      set(objdir ${target_DIR}/${lib}.objdir)
 
       add_custom_command(OUTPUT ${objdir}
         COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
@@ -142,31 +145,32 @@ function(merge_static_libs TARGET_NAME)
 
       add_custom_command(OUTPUT ${objlistfile}
         COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
-        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ../${objlistfile}
+        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ${objlistfile}
         DEPENDS ${lib} ${objdir}
         WORKING_DIRECTORY ${objdir})
 
-      # Empty dummy source file that goes into merged library		
-      set(mergebase ${lib}.mergebase.c)		
-      add_custom_command(OUTPUT ${mergebase}		
-        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}		
-        DEPENDS ${objlistfile})		
-
-      list(APPEND mergebases "${mergebase}")
+      list(APPEND target_OBJS "${objlistfile}")
     endforeach()
 
-    add_library(${TARGET_NAME} STATIC ${mergebases})
+    # Make the generated dummy source file depended on all static input
+    # libs. If input lib changes,the source file is touched
+    # which causes the desired effect (relink).
+    add_custom_command(OUTPUT ${target_SRCS}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
+      DEPENDS ${libs} ${target_OBJS})
+
+    # Generate dummy staic lib
+    file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
     target_link_libraries(${TARGET_NAME} ${libs_deps})
 
     # Get the file name of the generated library
-    set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
+    set(target_LIBNAME "$<TARGET_FILE:${TARGET_NAME}>")
 
-    foreach(lib ${libs})
-      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-        COMMAND ${CMAKE_AR} cr ${outlibfile} *.o
-        COMMAND ${CMAKE_RANLIB} ${outlibfile}
-        WORKING_DIRECTORY ${lib}.objdir)
-    endforeach()
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+        COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
+        COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
+        WORKING_DIRECTORY ${target_DIR})
   endif()
 endfunction(merge_static_libs)
 
@@ -196,7 +200,7 @@ function(cc_library TARGET_NAME)
     add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
 
   else(cc_library_SRCS)
-    if (cc_library_DEPS)
+    if(cc_library_DEPS)
       merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
     else()
       message(FATAL "Please specify source file or library in cc_library.")
@@ -249,7 +253,7 @@ function(nv_library TARGET_NAME)
       foreach(source_file ${nv_library_SRCS})
         string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
         if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-          list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
         endif()
       endforeach()
       add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
diff --git a/cmake/util.cmake b/cmake/util.cmake
index ac911052eb970c5a3e485e3178dd788b1517ca30..d1aee3e170a2d143ac06b438725e907e96f041c8 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -97,6 +97,10 @@ function(link_paddle_exe TARGET_NAME)
         target_link_libraries(${TARGET_NAME} log)
     endif(ANDROID)
 
+    if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR)
+      target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+    endif()
+
     add_dependencies(${TARGET_NAME} ${external_project_dependencies})
 endfunction()
 
diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/refactor/distributed_architecture.md
new file mode 100644
index 0000000000000000000000000000000000000000..ac7e98ccf1aadbb973a4801fde842375cf63448c
--- /dev/null
+++ b/doc/design/refactor/distributed_architecture.md
@@ -0,0 +1,222 @@
+# Design Doc: Distributed Training Architecture
+
+## Abstract
+
+PaddlePaddle v0.10.0 uses the "trainer-parameter server"
+architecture. We run multiple replicated instances of trainers (runs
+the same code written by the user) and parameter servers for
+distributed training. This architecture served us well, but has some
+limitations:
+
+1. Need to write special code to handle tasks which should only be run
+  by a single trainer. E.g., initializing model and saving model.
+
+2. Model parallelism is hard: need to write if-else branches conditioned
+  on the trainer ID to partition model onto each trainer, and manually
+  write the inter-model-shard communication code.
+
+3. The user can not directly specify the parameter update rule: need
+   to modify the parameter server C++ code and compile a new
+   binary. This adds complication for researchers: A lot of extra
+   effort is required. Besides, the training job submission program
+   may not allow running arbitrary binaries.
+
+This design doc discusses PaddlePaddle's new distributed training
+architecture that addresses the above limitations.
+
+## Analysis
+
+We will assume the user writes the trainer program by Python, the same
+analysis holds if the trainer program is written in C++.
+
+### Limitation 1
+
+If we look at the Python code that the user writes, there are two
+kinds of functionalities:
+
+- The training logic such as load / save model and print log.
+- The neural network definition such as the definition of the data
+  layer, the fully connected layer, the cost function and the
+  optimizer.
+
+When we training with PaddlePaddle v0.10.0 distributedly, multiple
+replicated Python instances are running on different nodes: both the
+training logic and the neural network computation is replicated.
+
+The tasks that should only run once all belong to the training logic,
+if we only replicate the neural network computation, but do **not**
+replicate the training logic, the limitation could be solved.
+
+### Limitation 2
+
+Model parallelism means running a single model on multiple nodes by
+partitioning the model onto different nodes and managing the
+inter-model-shard communications.
+
+PaddlePaddle should be able to modify the nerual network computation
+definition to support model parallelism automatically. However, the
+computation is only specified in Python code, and PaddlePaddle can not
+modify Python code.
+
+Just like compiler uses a intermediate representation (IR) so that
+programmer does not need to manually optimize their code in most of
+the cases - the compiler will optimize the IR:
+
+<img src="src/compiler.png"/>
+
+We can have our own IR too: PaddlePaddle can support model parallel by
+converting the IR so the user no longer need to manually do it in
+Python:
+
+<img src="src/paddle-compile.png"/>
+
+The IR for PaddlePaddle after refactor is called `Block`, it specifies
+the computation dependency graph and the variables used in the
+computation.
+
+### Limitation 3
+
+The user can not directly specify the parameter update rule for the
+parameter server because the parameter server does not use the same
+computation definition as the trainer. Instead, the update rule is
+baked in the parameter server. The user can not specify the update
+rule in the same way of specifying the trainer computation.
+
+This could be fixed by making the parameter server run the same
+computation definition as the trainer. For a detailed explanation,
+please
+see
+[Design Doc: Operation Graph Based Parameter Server](./dist_train.md)
+
+## Distributed Training Architecture
+
+The new distributed training architecture can address the above
+limitations. Below is the illustration:
+
+<img src="src/distributed_architecture.png"/>
+
+The architecture includes major components: *PaddlePaddle Python*,
+*PaddlePaddle converter* and *PaddlePaddle runtime*:
+
+### PaddlePaddle Python
+
+PaddlePaddle Python is the Python library that user's Python trainer
+invoke to build the neural network topology, start training, etc.
+
+```Python
+paddle.init()
+input = paddle.op.recordIO("/home/data/mnist.recordio") # file stored on the cluster
+img, label = input[0], input[1]
+hidden = paddle.layer.fc(input=img, size=200, act=paddle.activation.Tanh())
+prediction = paddle.layer.fc(input=img, size=10, act=paddle.activation.Softmax())
+cost = paddle.layer.classification_cost(input=prediction, label=label)
+optimizer = paddle.optimizer.SGD(cost, learning_rate=0.01)
+session = paddle.session.NewRemote(num_trainer=3, num_ps=2, GPU_per_trainer=1)
+for i in range(1000):
+	_, cost_val = session.eval(targets=[cost, optimizer])
+	print cost_val
+```
+
+The code above is a typical Python trainer code, the neural network
+topology is built using helper functions such as
+`paddle.layer.fc`. The training is done by calling `session.eval`
+iteratively.
+
+#### session.eval
+
+As shown in the graph, `session.eval` sends the IR and the evaluation
+inputs/targets to the PaddlePaddle cluster for evaluation. The
+targets can be any variable in the computation graph. When the target
+is the `optimizer` variable, the neural network will be optimized
+once. When the target is the `cost` variable, `session.eval` returns
+the cost value.
+
+The Python `session` is a wrapper of the C++ `Session` class. For more
+information about `Session`, please
+see [Design Doc: Session](./session.md).
+
+### PaddlePaddle Converter
+
+PaddlePaddle converter automatically converts the IR in the request
+(IR and evaluation inputs/targets) from PaddlePaddle Python to new
+partitioned IRs and dispatch the new IRs and evaluation inputs/targets
+to different PaddlePaddle runtimes. Below are the steps:
+
+1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that
+   fetches the eval targets to the IR.
+
+1. Extract a new computation (sub)graph with `feed` and `fetch` OP as
+   the boundary. The runtime does not need to run the OP that is not
+   dependent by the `fetch` OP.
+
+1. Optimizes the computation graph.
+
+1. Place the OPs in the graph onto different devices on different
+   PaddlePaddle runtime according to a placement algorithm and device
+   constraint specified by the user.
+
+1. Partition the graph according to runtime boundaries and add `send` /
+   `recv` OP pair on the runtime boundaries.
+
+1. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+
+1. PaddlePaddle runtimes with the `fetch` OP reports evaluation
+   results back to the converter, the convert reports the evaluation
+   results back to the PaddlePaddle Python.
+   
+The output IRs will be cached to optimize the conversion latency.
+
+
+#### Placement Algorithm
+
+Our first implementation will only support "trainer-parameter server"
+placement: the parameters, initializers, and optimizers are placed on
+the PaddlePaddle runtimes with the parameter server role. And
+everything else will be placed on the PaddlePaddle runtimes with the
+trainer role. This has the same functionality of our
+"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
+is more general and flexible.
+
+In the future, we will implement the general placement algorithm,
+which makes placements according to the input IR, and a model of
+device computation time and device communication time. Model
+parallelism requires the general placement algorithm.
+
+
+### PaddlePaddle Runtime
+
+The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and
+runs the IR. The runtime does not need to do OP placement since it's
+already done by the converter.
+
+
+### Local Training Architecture
+
+The local training architecture will be the same as the distributed
+training architecture, the differences are everything runs locally,
+and there is just one PaddlePaddle runtime:
+
+<img src="src/local_architecture.png"/>
+
+
+### Training Data
+
+In PaddlePaddle v0.10.0, training data is typically read
+with [data reader](../reader/README.md) from Python. This approach is
+no longer efficient when training distributedly since the Python
+process no longer runs on the same node with the trainer processes,
+the Python reader will need to read from the distributed filesystem
+(assuming it has the access) and send to the trainers, doubling the
+network traffic.
+
+When doing distributed training, the user can still use Python data
+reader: the training data are sent with `session.eval`. However should
+be used for debugging purpose only. The users are encouraged to use
+the read data OPs.
+
+
+## References:
+
+[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
+
+[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf)
diff --git a/doc/design/ops/dist_train.md b/doc/design/refactor/parameter_server.md
similarity index 100%
rename from doc/design/ops/dist_train.md
rename to doc/design/refactor/parameter_server.md
diff --git a/doc/design/refactor/src/compiler.graffle b/doc/design/refactor/src/compiler.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..8cc678fea3c820103e7ce81f7a5d625d6c1d92de
Binary files /dev/null and b/doc/design/refactor/src/compiler.graffle differ
diff --git a/doc/design/refactor/src/compiler.png b/doc/design/refactor/src/compiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..65d34f841afce9756def07dd8ecb9ca44e658bfe
Binary files /dev/null and b/doc/design/refactor/src/compiler.png differ
diff --git a/doc/design/ops/src/dist-graph.graffle b/doc/design/refactor/src/dist-graph.graffle
similarity index 100%
rename from doc/design/ops/src/dist-graph.graffle
rename to doc/design/refactor/src/dist-graph.graffle
diff --git a/doc/design/ops/src/dist-graph.png b/doc/design/refactor/src/dist-graph.png
similarity index 100%
rename from doc/design/ops/src/dist-graph.png
rename to doc/design/refactor/src/dist-graph.png
diff --git a/doc/design/refactor/src/distributed_architecture.graffle b/doc/design/refactor/src/distributed_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..f8496e57326c38de7468eb452a7713291d57653c
Binary files /dev/null and b/doc/design/refactor/src/distributed_architecture.graffle differ
diff --git a/doc/design/refactor/src/distributed_architecture.png b/doc/design/refactor/src/distributed_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..410c4510c6aab301dec95e6427fe80ac24e105fe
Binary files /dev/null and b/doc/design/refactor/src/distributed_architecture.png differ
diff --git a/doc/design/ops/src/local-graph.graffle b/doc/design/refactor/src/local-graph.graffle
similarity index 100%
rename from doc/design/ops/src/local-graph.graffle
rename to doc/design/refactor/src/local-graph.graffle
diff --git a/doc/design/ops/src/local-graph.png b/doc/design/refactor/src/local-graph.png
similarity index 100%
rename from doc/design/ops/src/local-graph.png
rename to doc/design/refactor/src/local-graph.png
diff --git a/doc/design/refactor/src/local_architecture.graffle b/doc/design/refactor/src/local_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..cc7783c45381f25ded0b898649322c81418ad317
Binary files /dev/null and b/doc/design/refactor/src/local_architecture.graffle differ
diff --git a/doc/design/refactor/src/local_architecture.png b/doc/design/refactor/src/local_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..4b999538b7825c805292ee28b5e3256d5543bd09
Binary files /dev/null and b/doc/design/refactor/src/local_architecture.png differ
diff --git a/doc/design/refactor/src/paddle-compile.graffle b/doc/design/refactor/src/paddle-compile.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..a6348cc3dbcaca923c6e794681b2edb85cb9f8f6
Binary files /dev/null and b/doc/design/refactor/src/paddle-compile.graffle differ
diff --git a/doc/design/refactor/src/paddle-compile.png b/doc/design/refactor/src/paddle-compile.png
new file mode 100644
index 0000000000000000000000000000000000000000..e0f13d551ac41afaec627a57dea79356464bf0bf
Binary files /dev/null and b/doc/design/refactor/src/paddle-compile.png differ
diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst
index acbf4c87ae5242f6cfc593a7fddc649ee3a70d7c..b3ecfba791ead8349ded018a30059b03eacbdacd 100644
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -390,4 +390,125 @@ PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数
 
 * 如果发现最早的报错就是网络通信的问题，很有可能是非独占方式执行导致的端口冲突，可以联系OP，看当前MPI集群是否支持resource=full参数提交，如果支持增加此参数提交，并更换job 端口。
 
-* 如果当前MPI集群并不支持任务独占模式，可以联系OP是否可以更换集群或升级当前集群。
\ No newline at end of file
+* 如果当前MPI集群并不支持任务独占模式，可以联系OP是否可以更换集群或升级当前集群。
+
+19. PaddlePaddle如何输出多个层
+------------------------------
+
+* 将需要输出的层作为 :code:`paddle.inference.Inference()` 接口的 :code:`output_layer` 参数输入，代码如下：
+
+..  code-block:: python
+
+    inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters)
+
+* 指定要输出的字段进行输出。以输出 :code:`value` 字段为例，代码如下：
+
+..  code-block:: python
+
+    out = inferer.infer(input=data_batch, flatten_result=False, field=["value"])
+
+这里设置 :code:`flatten_result=False`，得到的输出结果是元素个数等于输出字段数的 :code:`list`，该 :code:`list` 的每个元素是由所有输出层相应字段结果组成的 :code:`list`，每个字段结果的类型是 :code:`numpy.array`。:code:`flatten_result` 的默认值为 :code:`True`，该情况下，PaddlePaddle会分别对每个字段将所有输出层的结果按行进行拼接，如果各输出层该字段 :code:`numpy.array` 结果的相应维数不匹配，程序将不能正常运行。
+
+20. :code:`paddle.layer.memory` 的参数 :code:`name` 如何使用
+-------------------------------------------------------------
+
+* :code:`paddle.layer.memory` 用于获取特定layer上一时间步的输出，该layer是通过参数 :code:`name` 指定，即，:code:`paddle.layer.memory` 会关联参数 :code:`name` 取值相同的layer，并将该layer上一时间步的输出作为自身当前时间步的输出。
+
+* PaddlePaddle的所有layer都有唯一的name，用户通过参数 :code:`name` 设定，当用户没有显式设定时，PaddlePaddle会自动设定。而 :code:`paddle.layer.memory` 不是真正的layer，其name由参数 :code:`memory_name` 设定，当用户没有显式设定时，PaddlePaddle会自动设定。:code:`paddle.layer.memory` 的参数 :code:`name` 用于指定其要关联的layer，需要用户显式设定。
+
+21. dropout 使用
+-----------------
+
+* 在PaddlePaddle中使用dropout有两种方式
+
+  * 在相应layer的 :code:`layer_atter` 设置 :code:`drop_rate`，以 :code:`paddle.layer.fc` 为例，代码如下：
+
+  ..  code-block:: python
+
+      fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5))
+
+  * 使用 :code:`paddle.layer.dropout`，以 :code:`paddle.layer.fc` 为例，代码如下：
+
+  ..  code-block:: python
+
+      fc = paddle.layer.fc(input=input)
+      drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5)
+
+* :code:`paddle.layer.dropout` 实际上使用了 :code:`paddle.layer.add_to`，并在该layer里采用第一种方式设置 :code:`drop_rate` 来使用dropout的。这种方式对内存消耗较大。
+
+* PaddlePaddle在激活函数里实现dropout，而不是在layer里实现。
+
+* :code:`paddle.layer.lstmemory`、:code:`paddle.layer.grumemory`、:code:`paddle.layer.recurrent` 不是通过一般的方式来实现对输出的激活，所以不能采用第一种方式在这几个layer里设置 :code:`drop_rate` 来使用dropout。若要对这几个layer使用dropout，可采用第二种方式，即使用 :code:`paddle.layer.dropout`。
+
+22. 如何设置学习率退火（learning rate annealing）
+------------------------------------------------
+
+在相应的优化算法里设置learning_rate_schedule及相关参数，以使用Adam算法为例，代码如下：
+
+..  code-block:: python
+
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=1e-3,
+        learning_rate_decay_a=0.5,
+        learning_rate_decay_b=0.75,
+        learning_rate_schedule="poly",)
+
+PaddlePaddle目前支持8种learning_rate_schedule，这8种learning_rate_schedule及其对应学习率计算方式如下：
+
+* "constant"
+
+  lr = learning_rate
+
+* "poly"
+
+  lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b)
+
+  其中，num_samples_processed为已训练样本数，下同。
+
+* "caffe_poly"
+
+  lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b)
+
+* "exp"
+
+  lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b)
+
+* "discexp"
+
+  lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b))
+
+* "linear"
+
+  lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b)
+
+* "manual"
+
+  这是一种按已训练样本数分段取值的学习率退火方法。使用该learning_rate_schedule时，用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数，当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例，代码如下：
+
+  ..  code-block:: python
+
+      optimizer = paddle.optimizer.Adam(
+          learning_rate=1e-3,
+          learning_rate_schedule="manual",
+          learning_rate_args="1000:1.0,2000:0.9,3000:0.8",)
+
+  在该示例中，当已训练样本数小于等于1000时，学习率为 :code:`1e-3 * 1.0`；当已训练样本数大于1000小于等于2000时，学习率为 :code:`1e-3 * 0.9`；当已训练样本数大于2000时，学习率为 :code:`1e-3 * 0.8`。
+
+* "pass_manual"
+
+  这是一种按已训练pass数分段取值的学习率退火方法。使用该learning_rate_schedule时，用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数，当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例，代码如下：
+
+  ..  code-block:: python
+
+      optimizer = paddle.optimizer.Adam(
+          learning_rate=1e-3,
+          learning_rate_schedule="manual",
+          learning_rate_args="1:1.0,2:0.9,3:0.8",) 
+
+  在该示例中，当已训练pass数小于等于1时，学习率为 :code:`1e-3 * 1.0`；当已训练pass数大于1小于等于2时，学习率为 :code:`1e-3 * 0.9`；当已训练pass数大于2时，学习率为 :code:`1e-3 * 0.8`。
+
+23. 出现 :code:`Duplicated layer name` 错误怎么办
+--------------------------------------------------
+
+出现该错误的原因一般是用户对不同layer的参数 :code:`name` 设置了相同的取值。遇到该错误时，先找出参数 :code:`name` 取值相同的layer，然后将这些layer的参数 :code:`name` 设置为不同的值。
+
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 84e33177740ca1652efc09c8081c2519b4366906..30b144d849bec367cd0197b6082889e011193a9a 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -20,7 +20,7 @@ Docker使用入门
 		  
      docker pull paddlepaddle/paddle:0.10.0
 
-  来下载Docker镜像，paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的，推荐国内用户使用ocker.paddlepaddle.org/paddle下载。
+  来下载Docker镜像，paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的，推荐国内用户使用docker.paddlepaddle.org/paddle下载。
 
 - *容器*： 如果说一个Docker镜像就是一个程序，那容器就是这个程序运行时产生的“进程”。
   实际上，一个容器就是一个操作系统的进程，但是是运行在独立的进程空间，文件系统以及网络之上。
diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..b7aa501db9e5c7378398fad48503f82bff893b60
--- /dev/null
+++ b/doc/howto/dev/new_op_en.md
@@ -0,0 +1,235 @@
+# How to write a new operator
+
+ - [Background](#Background)
+ - [Implementing C++ Types](#Implementing_C++_Types)
+   - [Defining ProtoMaker](#Defining_ProtoMaker)
+   - [Defining Operator](#Defining_Operator)
+   - [Registering Operator](#Registering_Operator)
+   - [Compilation](#Compilation)
+ - [Python Binding](#Python_Binding)
+ - [Unit Tests](#Unit_Tests)
+
+## Background
+
+Here are the base types needed. For details, please refer to the design docs.
+
+- `framework::OperatorBase`: Operator (Op)base class.
+- `framework::OpKernel`: Base class for Op computation.
+- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation.
+- `class OpProtoAndCheckerMaker`: Describes an Operator's input, output, attributes and description, mainly used to interface with Python API.
+
+An operator can be differentiated by whether in has kernel methods. An operator with kernel inherits from `OperatorWithKernel` while the ones without inherit from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information:
+
+
+ Information           | Where is it defined
+--------------  | :----------------------
+OpProtoMake definition  | `.cc`files, Backward Op does not need an OpProtoMake interface.
+Op definition           | `.cc` files
+Kernel implementation       | The kernel methods shared between CPU and GPU are defined in `.h` files. CPU-specific kernels live in `.cc` files, while GPU-specific kernels are implemented in `.cu`files.
+Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the GPU implementation.
+
+
+New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions. **
+
+
+Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
+
+
+## Implementing C++ Types
+
+
+### 1. Defining Class ProtoMaker
+
+Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and pne output.
+
+First, define `ProtoMaker` to describe the Operator's input, output, and additional comments:
+
+```cpp
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor), 2D tensor of size (M x K)");
+    AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
+    AddOutput("Out", "(Tensor), 2D tensor of size (M x N)");
+    AddComment(R"DOC(
+Two Element Mul Operator.
+The equation is: Out = X * Y
+)DOC");
+  }
+};
+```
+
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)is inherited from`framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor：
+
+   - `framework::OpProto` stores Operator input and variable attribute, used for generating Python API interfaces.
+   - `framework::OpAttrChecker` is used to validate variable attributes.
+
+The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information will be added to `OpProto`.
+
+The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md).
+
+
+An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37) is implemented as follows:
+
+```cpp
+template <typename AttrType>
+class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensor of scale operator.").NotInGradient();
+    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
+    AddComment(R"DOC(Scale operator
+The equation is: Out = scale*X
+)DOC");
+    AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
+  }
+};
+```
+
+There are two changes in this example:
+
+- `AddInput("X","...").NotInGradient()` expresses that input `X` is not involved in `ScaleOp`'s corresponding computation. If an input to an operator is not participating in back-propagation, please explicitly set `.NotInGradient()`.
+
+- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);`  adds `scale`constant as an attribute, and sets the default value to 1.0.
+
+
+### 2. Defining Operator
+
+The following code defines the interface for MulOp:
+
+```cpp
+class MulOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto dim0 = ctx.Input<Tensor>("X")->dims();
+    auto dim1 = ctx.Input<Tensor>("Y")->dims();
+    PADDLE_ENFORCE_EQ(dim0.size(), 2,
+                      "input X(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("X"));
+    PADDLE_ENFORCE_EQ(dim1.size(), 2,
+                      "input Y(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("Y"));
+    PADDLE_ENFORCE_EQ(
+        dim0[1], dim1[0],
+        "First matrix's width must be equal with second matrix's height.");
+    ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
+  }
+};
+```
+
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22) is inherited from `OperatorWithKernel`. Its `public` member
+
+```cpp
+using framework::OperatorWithKernel::OperatorWithKernel;
+```
+
+expresses an operator constructor using base class `OperatorWithKernel`, alternatively written as
+
+```cpp
+MulOp(const std::string &type, const framework::VariableNameMap &inputs,
+      const framework::VariableNameMap &outputs,
+      const framework::AttributeMap &attrs)
+  : OperatorWithKernel(type, inputs, outputs, attrs) {}
+```
+
+`InferShape` interface needs to be re-written.`InferShape` is a constant method and cannot modify Op's member variables, its constant member `const framework::InferShapeContext &ctx` can be used to extract input, output, and attributes. It functions to
+
+  - 1). validate and error out early: it checks input data dimensions and types.
+  - 2). configures the tensor shape in the output.
+
+Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later.
+
+### 3. Defining OpKernel
+
+`MulKernel` inherits `framework::OpKernel`, which includes the following templates:
+
+- `typename  Place` denotes device type. When different devices, namely the CPU and the GPU, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+
+- `typename T` denotes data type, such as `float` or `double`.
+
+`MulKernel` types need to rewrite the interface for `Compute`.
+- `Compute` takes one input variable `const framework::ExecutionContext& context`.
+- Compared with `InferShapeContext`, `ExecutionContext` includes device types, and can similarly extract input, output, and attribute variables.
+- `Compute` implements the computation logics of an `OpKernel`.
+
+`MulKernel`'s implementation of `Compute` is as follows:
+
+  ```cpp
+  template <typename Place, typename T>
+  class MulKernel : public framework::OpKernel {
+  public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<Tensor>("X");
+    auto* Y = context.Input<Tensor>("Y");
+    auto* Z = context.Output<Tensor>("Out");
+    Z->mutable_data<T>(context.GetPlace());
+    auto* device_context =
+        const_cast<platform::DeviceContext*>(context.device_context_);
+    math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+  }
+  };
+  ```
+
+Note that **different devices (CPU, GPU)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**
+
+`MulOp`'s CPU and GPU share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+
+To ease the writing of `OpKernel` compute, and for reusing code cross-device, `Eigen unsupported Tensor` module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).
+
+
+This concludes the forward implementation of an operator. Next its operation and kernel need to be registered in a `.cc` file.
+
+The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**.
+
+### 4. Registering Operator
+
+- In `.cc` files, register forward and backward operator classes and the CPU kernel.
+
+    ```cpp
+    namespace ops = paddle::operators;
+    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
+    REGISTER_OP_CPU_KERNEL(mul_grad,
+                  ops::MulGradKernel<paddle::platform::CPUPlace, float>);
+    ```
+
+   In that code block,
+
+    - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
+    - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
+    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulKernel`.
+
+
+- Registering GPU Kernel in `.cu` files
+    - Note that if GPU Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as
+
+    ```cpp
+    // if use Eigen unsupported module before include head files
+    #define EIGEN_USE_GPU
+
+    namespace ops = paddle::operators;
+    REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
+    REGISTER_OP_GPU_KERNEL(mul_grad,
+                           ops::MulGradKernel<paddle::platform::GPUPlace, float>);
+    ```
+
+### 5. Compilation
+
+Run the following commands to compile.
+
+```
+make mul_op
+```
+
+## Python Binding
+
+The system will automatically bind to Python and link it to a generated library.
+
+## Unit Tests
+
+Unit tests include comparing a forward operator's implementations on different devices, comparing a backward operator's implementation on different devices, and a scaling test for the backward operator. Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py).
diff --git a/doc/survey/cluster_bootstrapping_tools.md b/doc/survey/cluster_bootstrapping_tools.md
new file mode 100644
index 0000000000000000000000000000000000000000..1cd9962700bb49866f1ed6987abc28b27888a23f
--- /dev/null
+++ b/doc/survey/cluster_bootstrapping_tools.md
@@ -0,0 +1,71 @@
+# Cluster bootstrapping tool survey
+## Abstract
+In order to bring up a cluster from bare metal machine to a fully functional kubernetes cluster for Paddlepaddle to run, we need to utilize some tools. Here we are going to compare [Sextant](https://github.com/k8sp/sextant) and [Tectonic installer](https://github.com/coreos/tectonic-installer)
+
+## Basic assumptions
+Here are some basic assumptions before we move on to  details
+1. You are an administrator of a bare metal machine cluster, which means:
+  * you have full control to each of the machines.
+  * you have full control to the network which machines are connected to.
+2. Machines can be booted from network with PEX or iPXE
+3. You understand the [general procedure to bring up a cluster](#appendix-general-procedure-to-bring-up-a-cluster)
+
+if your cluster is able to mark above items with checkmarks, then keep reading.
+
+## Comparing Sextant and Tectonic installer
+### Sextant
+Sextant is an end2end solution to bring up a bare metal cluster to a fully functional k8s cluster, it integrates DHCP, name service, PEX, cloud-config-service, docker registry services altogether. 
+
+#### Pros
+1. End2End: basically all admin need to do is to config the cluster.yaml and power on the cluster.
+2. Offline cluster configuration: Sextant has 2 phases during working with it, config time and deploy time. when admin is configuring, it requires admin's machine has internet connectivity, which will download some images, etc. But in deploy time, it's completely OK to go offline since all dependencies are ready during config time.
+3. docker registry integrated.
+4. GPU machine took care of.
+
+### Cons
+1. k8s API server is not deployed with high availability in considering by default.
+2. No grouping support.
+3. No API interface, a one-off service.
+
+
+### Tectonic installer
+First of all, Tectonic is not free, it requires coreos.com account as a step of installation, and free user can only create less than 10 nodes.
+
+Tectonic is a suite of software which wraps around k8s and providing more utility regarding dev ops, ie, 
+Tectonic installer as it's named, it installs Tectonic to a bare metal cluster which means it's not totally an equivalent of Sextant. At the "booting a cluster" part, it mostly utilizes [Matchbox](https://github.com/coreos/matchbox), which is a general cluster bootstrapper.
+
+Matchbox's Approach is similar to Sexstant.
+
+### Pros
+1. supports grouping machines.
+2. supports running provisioning service in rtk. (not a big deal though).
+3. supports http/gRPC API interface.
+4. supports multi-template.
+
+### Cons
+1. Not an e2e solution to bring up a cluster, need a lot of extra work and other software.
+2. [Not fully supporting](https://github.com/coreos/matchbox/issues/550) centOS deployment yet.
+
+## Conclusion
+Sextant is a better solution overall for paddle cloud deploying to a bare metal cluster. It would be great if Sextant can also 1) deploy k8s api server with high availability by default; 2) not designed as a one-off service.
+
+
+
+## Appendix: General procedure to bring up a cluster
+It's physically impossible for a cluster admin to manually install OS and applications into cluster nodes one by one, here is what an admin would do in cloud industry:
+1. setup a bootstrap machine with static IP in the cluster, which has following services:
+  * DHCP: assigns ip address for rest of the nodes.
+  * name service: to map node name to a IP
+  * PXE related services: the booting related info will be delivered to newly booted machines as their IP is assigned via DHCP service, PXE service will provide further booting and installing info and image with TFTP and http protocol. 
+  * cluster config service: this is for providing cluster node with OS config via http
+  * optional docker registry: a built-in docker registry makes the whole cluster independent from connecting internet, and speeds up software distribution.
+2. New node powers on, it will
+  * broadcast the request for an IP address
+  * DHCP server assigns the IP address, and deliver the PXE booting related info to the node.
+  * cluster node will request config files with booting info delivered with DHCP via the TFTP service, and in most of the cases, the config file will point to a http service for the booting image.
+  * Since PXE is configured with initrd, it will utilize the cloud config service and do further installations like coreOS or K8s installations.
+  * then restart the node.
+
+For further understanding, following 2 links from Matchbox are some good readings:
+* [Machine lifecycle](https://github.com/coreos/matchbox/blob/master/Documentation/machine-lifecycle.md)
+* [PXE booting](https://github.com/coreos/matchbox/blob/master/Documentation/network-booting.md)
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
index dd9e4f1cbd636e29a6934d1119fc93ebc9d0ecee..b9bbe58951c643f1b1649858880fbd2ba3a2a7b7 100644
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -62,6 +62,7 @@ if(ANDROID)
           LIBRARY DESTINATION lib/${ANDROID_ABI})
   execute_process(
     COMMAND ${GIT_EXECUTABLE} log --pretty=oneline -1
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
     OUTPUT_VARIABLE GIT_COMMITS_LIST
     RESULT_VARIABLE GIT_COMMITS_LIST_RESULT
     ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
@@ -81,8 +82,7 @@ if(ANDROID)
       )"
   )
 else(ANDROID)
-  install(TARGETS paddle_capi_whole
-          ARCHIVE DESTINATION lib)
+  install(TARGETS paddle_capi_whole ARCHIVE DESTINATION lib)
   if(NOT IOS)
     install(TARGETS paddle_capi_shared DESTINATION lib)
   endif()
diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc
index 159ed03b92bbc57ab79734de832845ef1f367de9..510dc28c57f642786e7c64d86961c76ac80014a8 100644
--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
@@ -28,41 +28,11 @@ ProgramDesc& GetProgramDesc() {
   return *g_program_desc;
 }
 
-template <>
-AttrType AttrTypeID<int>() {
-  return INT;
-}
-template <>
-AttrType AttrTypeID<float>() {
-  return FLOAT;
-}
-template <>
-AttrType AttrTypeID<std::string>() {
-  return STRING;
-}
-template <>
-AttrType AttrTypeID<std::vector<int>>() {
-  return INTS;
-}
-template <>
-AttrType AttrTypeID<std::vector<float>>() {
-  return FLOATS;
-}
-template <>
-AttrType AttrTypeID<std::vector<std::string>>() {
-  return STRINGS;
-}
-template <>
-AttrType AttrTypeID<std::vector<std::pair<int, int>>>() {
-  return INT_PAIRS;
-}
-template <>
-AttrType AttrTypeID<BlockDesc>() {
-  return BLOCK;
-}
-
 Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
   switch (attr_desc.type()) {
+    case framework::AttrType::BOOLEAN: {
+      return attr_desc.b();
+    }
     case framework::AttrType::INT: {
       return attr_desc.i();
     }
@@ -72,6 +42,13 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
     case framework::AttrType::STRING: {
       return attr_desc.s();
     }
+    case framework::AttrType::BOOLEANS: {
+      std::vector<bool> val(attr_desc.bools_size());
+      for (int i = 0; i < attr_desc.bools_size(); ++i) {
+        val[i] = attr_desc.bools(i);
+      }
+      return val;
+    }
     case framework::AttrType::INTS: {
       std::vector<int> val(attr_desc.ints_size());
       for (int i = 0; i < attr_desc.ints_size(); ++i) {
@@ -93,14 +70,6 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
       }
       return val;
     }
-    case framework::AttrType::INT_PAIRS: {
-      std::vector<std::pair<int, int>> val(attr_desc.int_pairs_size());
-      for (int i = 0; i < attr_desc.int_pairs_size(); ++i) {
-        val[i].first = attr_desc.int_pairs(i).first();
-        val[i].second = attr_desc.int_pairs(i).second();
-      }
-      return val;
-    }
     case framework::AttrType::BLOCK: {
       return GetProgramDesc().mutable_blocks(attr_desc.block_idx());
     }
diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h
index 6735ca0a8d70d04dae9dd610742370e83472e9b0..488fa38faf12ee51087643f79295f36bfd33ee22 100644
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -27,9 +27,10 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+// The order should be as same as framework.proto
 typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
-                       std::vector<float>, std::vector<std::string>,
-                       std::vector<std::pair<int, int>>, BlockDesc*>
+                       std::vector<float>, std::vector<std::string>, bool,
+                       std::vector<bool>, BlockDesc*>
     Attribute;
 
 typedef std::unordered_map<std::string, Attribute> AttributeMap;
@@ -37,7 +38,10 @@ typedef std::unordered_map<std::string, Attribute> AttributeMap;
 ProgramDesc& GetProgramDesc();
 
 template <typename T>
-AttrType AttrTypeID();
+inline AttrType AttrTypeID() {
+  Attribute tmp = T();
+  return static_cast<AttrType>(tmp.which() - 1);
+}
 
 Attribute GetAttrValue(const OpDesc::Attr& attr_desc);
 
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index 89a49f69062486ace67154f52450e7449a948851..951c7afbc14e2d9119169c1351d38ff0b67bdc5b 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -22,15 +22,11 @@ enum AttrType {
   INTS = 3;
   FLOATS = 4;
   STRINGS = 5;
-  INT_PAIRS = 6;
-  BLOCK = 7;
+  BOOLEAN = 6;
+  BOOLEANS = 7;
+  BLOCK = 8;
 }
 
-message IntPair {
-  required int32 first = 1;
-  required int32 second = 2;
-};
-
 // OpDesc describes an instance of a C++ framework::OperatorBase
 // derived class type.
 message OpDesc {
@@ -44,8 +40,9 @@ message OpDesc {
     repeated int32 ints = 6;
     repeated float floats = 7;
     repeated string strings = 8;
-    repeated IntPair int_pairs = 9;
-    optional int32 block_idx = 10;
+    optional bool b = 10;
+    repeated bool bools = 11;
+    optional int32 block_idx = 12;
   };
 
   message Var {
@@ -102,7 +99,7 @@ enum DataType {
 
 message LoDTensorDesc {
   required DataType data_type = 1;
-  repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
   optional int32 lod_level = 3 [ default = 0 ];
 }
 
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 908a1f2fd0abe0aa4016c72dbcbc18dcc144232c..3c349637cdbe59b2cf9a1ea28e7715f4181f9293 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -72,20 +72,16 @@ bool operator==(const LoD& a, const LoD& b) {
   return true;
 }
 
-void LoDTensor::SliceLevels(size_t level_begin, size_t level_end) {
+void LoDTensor::ShrinkLevels(size_t level_begin, size_t level_end) {
   auto new_lod = framework::SliceLevels(lod_, level_begin, level_end);
   lod_ = new_lod;
 }
 
-void LoDTensor::SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) {
-  PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
-                 NumLevels());
-  PADDLE_ENFORCE(elem_begin < NumElements(level),
-                 "element begin [%d] out of range [%d]", elem_begin,
-                 NumElements(level));
-  PADDLE_ENFORCE(elem_end < NumElements(level) + 1,
-                 "element end [%d] out of range [%d]", elem_end,
-                 NumElements(level));
+void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
+                              size_t elem_end) {
+  PADDLE_ENFORCE_LT(level, NumLevels());
+  PADDLE_ENFORCE_LT(elem_begin, NumElements(level));
+  PADDLE_ENFORCE_LT(elem_end, NumElements(level) + 1);
 
   auto new_lod = framework::SliceInLevel(lod_, level, elem_begin, elem_end);
   lod_ = new_lod;
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index fac5cd20aa7f9db0792f8102bb442192ab1ad63f..82f58464264c6871b51251e0feae3d5ca076cd2b 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -89,15 +89,15 @@ class LoDTensor : public Tensor {
   }
 
   /*
-   * Slice of levels[level_begin:level_end]
+   * Shrink levels[level_begin:level_end]
    */
-  void SliceLevels(size_t level_begin, size_t level_end);
+  void ShrinkLevels(size_t level_begin, size_t level_end);
 
   /*
-   * Slice of elements of a level, [elem_begin: elem_end]
+   * Shrink elements of a level, [elem_begin: elem_end]
    * @note: low performance in slice lod_.
    */
-  void SliceInLevel(size_t level, size_t elem_begin, size_t elem_end);
+  void ShrinkInLevel(size_t level, size_t elem_begin, size_t elem_end);
 
  private:
   LoD lod_;
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index 7915326b27a22e9280e3f09d9bbfc2a58f46aff7..486b839738ec077545163bc47e6a97ef188c3c2f 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -56,11 +56,11 @@ TEST_F(LoDTensorTester, NumElements) {
   ASSERT_EQ(lod_tensor_.NumElements(2), 8UL);
 }
 
-TEST_F(LoDTensorTester, SliceLevels) {
+TEST_F(LoDTensorTester, ShrinkLevels) {
   // slice 1 level
   for (size_t level = 0; level < 3UL; ++level) {
     LoDTensor new_lod_tensor = lod_tensor_;
-    new_lod_tensor.SliceLevels(level, level + 1);
+    new_lod_tensor.ShrinkLevels(level, level + 1);
     ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
     ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
     ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
@@ -68,7 +68,7 @@ TEST_F(LoDTensorTester, SliceLevels) {
   // slice 2 level
   for (size_t level = 0; level < 2UL; ++level) {
     LoDTensor new_lod_tensor = lod_tensor_;
-    new_lod_tensor.SliceLevels(level, level + 2);
+    new_lod_tensor.ShrinkLevels(level, level + 2);
     ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
     ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
     ASSERT_EQ(new_lod_tensor.NumElements(1),
@@ -77,10 +77,10 @@ TEST_F(LoDTensorTester, SliceLevels) {
   }
 }
 
-TEST_F(LoDTensorTester, SliceInLevel) {
+TEST_F(LoDTensorTester, ShrinkInLevel) {
   size_t level = 0;
   LoDTensor new_lod_tensor = lod_tensor_;
-  new_lod_tensor.SliceInLevel(level, 0, 2);
+  new_lod_tensor.ShrinkInLevel(level, 0, 2);
   EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL);
   EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL);
   EXPECT_EQ(new_lod_tensor.NumElements(1), 4UL);
@@ -89,7 +89,7 @@ TEST_F(LoDTensorTester, SliceInLevel) {
 
   level = 1;
   new_lod_tensor = lod_tensor_;
-  new_lod_tensor.SliceInLevel(level, 0, 2);
+  new_lod_tensor.ShrinkInLevel(level, 0, 2);
   ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
   ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
   ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index a75fd49a470fa5f9ce7a47ecd03c1084f9b48c27..fcbfc3e4377edd0ea55c8d4328c325fa18663001 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -60,8 +60,8 @@ std::string OperatorBase::Output(const std::string& name) const {
 const std::vector<std::string>& OperatorBase::Outputs(
     const std::string& name) const {
   auto it = outputs_.find(name);
-  PADDLE_ENFORCE(it != outputs_.end(), "Op %s does not have output %s", type_,
-                 name);
+  PADDLE_ENFORCE(it != outputs_.end(), "Op %s does not have output called %s",
+                 type_, name);
   return it->second;
 }
 
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 4b5a2ae523f2f7fde5445f0534cd99969ad9d59e..f040c09c089ec75c9773d752685be5e232e8f4b7 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -29,16 +29,19 @@ limitations under the License. */
 
 namespace paddle {
 
-namespace framework {
+namespace pybind {
 namespace details {
 template <bool less, size_t i, typename... args>
 struct CastToPyBufferImpl;
 }
+}  // namespace pybind
+
+namespace framework {
 
 class Tensor {
  public:
   template <bool less, size_t i, typename... args>
-  friend struct details::CastToPyBufferImpl;
+  friend struct pybind::details::CastToPyBufferImpl;
 
   template <typename T, size_t D, int MajorType, typename IndexType>
   friend struct EigenTensor;
@@ -165,12 +168,6 @@ class Tensor {
   /*! points to dimensions of memory block. */
   DDim dims_;
 
-  /**
-   * A cache of the number of elements in a tensor.
-   * Would be 0 for an uninitialized tensor.
-   */
-  int64_t numel_;
-
   /**
    * @brief   A PlaceHolder may be shared by more than one tensor.
    *
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index 6d2c14f4c47afb755b1c74f6dc4dd10ab25ed191..a5405f9c31543b5733f9db923c2a6f8b968cfc2d 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -147,13 +147,12 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
 
 inline Tensor& Tensor::Resize(const DDim& dims) {
   dims_ = dims;
-  numel_ = product(dims_);
   return *this;
 }
 
 inline const DDim& Tensor::dims() const { return dims_; }
 
-inline int64_t Tensor::numel() const { return numel_; }
+inline int64_t Tensor::numel() const { return product(dims_); }
 
 template <typename T>
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
diff --git a/paddle/gserver/activations/MKLDNNActivation.cpp b/paddle/gserver/activations/MKLDNNActivation.cpp
index ac50937ef3e28c1ac5aae651f9cf266ad07abcc4..18c5638100065109fba1f0647a1c5f91256f7b9d 100644
--- a/paddle/gserver/activations/MKLDNNActivation.cpp
+++ b/paddle/gserver/activations/MKLDNNActivation.cpp
@@ -27,31 +27,53 @@ static ClassRegistrar<ActivationFunction> gMKLDNNActivationRegistrar;
 #define MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) mkldnn_##ACT_TYPE##Activation
 
 /**
- * @def DEFINE_MKLDNN_ELTWISE_ACTIVATION
+ * @def BEGIN_MKLDNN_ACTIVATION
+ */
+#define BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \
+  class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) : public BASE_CLASS {
+/**
+ * @def END_MKLDNN_ACTIVATION
  */
-#define DEFINE_MKLDNN_ELTWISE_ACTIVATION(ACT_TYPE, ALPHA, BWD_ALPHA)        \
-  class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)                              \
-      : public MKLDNNEltwiseActivation {                                    \
-  private:                                                                  \
-    static const std::string name;                                          \
-    static const float alpha;                                               \
-    static const float bwdAlpha;                                            \
-                                                                            \
-  public:                                                                   \
-    const std::string& getName() const { return name; }                     \
-    float getAlpha() const { return alpha; }                                \
-    float getBwdAlpha() const { return bwdAlpha; }                          \
-  };                                                                        \
-  const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name =          \
-      "mkldnn_" #ACT_TYPE;                                                  \
-  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::alpha = ALPHA;        \
-  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::bwdAlpha = BWD_ALPHA; \
-  static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] {              \
-    gMKLDNNActivationRegistrar                                              \
-        .registerClass<MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)>(             \
-            "mkldnn_" #ACT_TYPE);                                           \
+#define END_MKLDNN_ACTIVATION(ACT_TYPE)                            \
+private:                                                           \
+  static const std::string name;                                   \
+                                                                   \
+public:                                                            \
+  const std::string& getName() const { return name; }              \
+  }                                                                \
+  ;                                                                \
+  const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name = \
+      "mkldnn_" #ACT_TYPE;                                         \
+  static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] {     \
+    gMKLDNNActivationRegistrar                                     \
+        .registerClass<MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)>(    \
+            "mkldnn_" #ACT_TYPE);                                  \
   });
 
+/**
+ * @def DEFINE_MKLDNN_ACTIVATION
+ */
+#define DEFINE_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \
+  BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)        \
+  END_MKLDNN_ACTIVATION(ACT_TYPE)
+
+/**
+ * @def DEFINE_MKLDNN_ELTWISE_ACTIVATION
+ */
+#define DEFINE_MKLDNN_ELTWISE_ACTIVATION(                            \
+    ACT_TYPE, BASE_CLASS, ALPHA, BWD_ALPHA)                          \
+  BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)                      \
+private:                                                             \
+  static const float alpha;                                          \
+  static const float bwdAlpha;                                       \
+                                                                     \
+public:                                                              \
+  float getAlpha() const { return alpha; }                           \
+  float getBwdAlpha() const { return bwdAlpha; }                     \
+  END_MKLDNN_ACTIVATION(ACT_TYPE)                                    \
+  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::alpha = ALPHA; \
+  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::bwdAlpha = BWD_ALPHA;
+
 /**
  * @brief MKLDNN Relu Activation.
  * Actually mkldnn_relu is Leaky Relu.
@@ -59,19 +81,129 @@ static ClassRegistrar<ActivationFunction> gMKLDNNActivationRegistrar;
  *  f(x) = negative_slope * x  (x <  0)
  * @note the negative_slope should be -0.f in forward
  */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, -0.f, 0.f)
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, MKLDNNEltwiseActivation, -0.f, 0.f)
 
 /**
  * @brief MKLDNN Tanh Activation.
  */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(tanh, 0.f, 0.f)
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(tanh, MKLDNNEltwiseActivation, 0.f, 0.f)
 
 /**
  * @brief MKLDNN ELU(Exponential Linear Unit) Activation.
  *  f(x) = x                              (x >= 0)
  *  f(x) = negative_slope * (exp(x) - 1)  (x <  0)
  */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(elu, 0.f, 0.f)
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(elu, MKLDNNEltwiseActivation, 0.f, 0.f)
+
+mkldnn::algorithm MKLDNNEltwiseActivation::getAlgo(std::string type) const {
+  const std::map<std::string, mkldnn::algorithm> algoMap = {
+      {"relu", algorithm::eltwise_relu},
+      {"tanh", algorithm::eltwise_tanh},
+      {"elu", algorithm::eltwise_elu}};
+  type.erase(0, 7);  // remove mkldnn_
+  algorithm algo = (algorithm)0;
+  mapGet(type, algoMap, &algo);
+  return algo;
+}
+
+void MKLDNNEltwiseActivation::resetFwd(Argument& act) {
+  if (cnt_ == act.value->getElementCnt()) {
+    return;
+  }
+  MKLDNNActivation::resetFwd(act);
+  // note: alpha represents the NegativeSlope when used in relu.
+  float alpha = getAlpha();
+  float beta = getBeta();
+  algorithm algo = getAlgo(this->getName());
+  auto fwdDesc = eltwise_fwd::desc(mkldnn::prop_kind::forward_training,
+                                   algo,
+                                   val_->getMemoryDesc(),
+                                   alpha,
+                                   beta);
+  fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, *engine_));
+  // use inplace for forward but save input value before submit
+  inVal_ = val_;
+  copyInVal_ = nullptr;
+  if (act.grad && algo == algorithm::eltwise_tanh) {
+    // tanh need save src input for backward
+    inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc());
+    copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
+    CHECK(copyInVal_) << "should not be emptry";
+    pipelineFwd_.push_back(*copyInVal_);
+  }
+  fwd_.reset(new eltwise_fwd(*fwdPD_, *val_, *val_));
+  pipelineFwd_.push_back(*fwd_);
+  needResetBwd_ = true;
+}
+
+void MKLDNNEltwiseActivation::resetBwd(Argument& act) {
+  if (!needResetBwd_) {
+    return;
+  }
+  VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
+  needResetBwd_ = false;
+  algorithm algo = getAlgo(this->getName());
+  float alpha = getBwdAlpha();
+  float beta = getBeta();
+  grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc());
+  auto eng = CPUEngine::Instance().getEngine();
+  auto bwdDesc = eltwise_bwd::desc(
+      algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
+  auto bwdPD = eltwise_bwd::primitive_desc(bwdDesc, eng, *fwdPD_);
+  CHECK(inVal_);
+  bwd_.reset(new eltwise_bwd(bwdPD, *inVal_, *grad_, *grad_));
+  pipelineBwd_.clear();
+  pipelineBwd_.push_back(*bwd_);
+}
+
+/**
+ * @brief MKLDNN Softmax Activation
+ */
+DEFINE_MKLDNN_ACTIVATION(softmax, MKLDNNSoftmaxActivation)
+
+void MKLDNNSoftmaxActivation::resetFwd(Argument& act) {
+  if (cnt_ == act.value->getElementCnt()) {
+    return;
+  }
+  MKLDNNActivation::resetFwd(act);
+  int axis = 1;
+  auto fwdDesc = softmax_fwd::desc(
+      mkldnn::prop_kind::forward_scoring, val_->getMemoryDesc(), axis);
+  auto fwdPD = softmax_fwd::primitive_desc(fwdDesc, *engine_);
+  fwd_.reset(new softmax_fwd(fwdPD, *val_, *val_));
+  pipelineFwd_.push_back(*fwd_);
+}
+
+Error __must_check MKLDNNSoftmaxActivation::forward(Argument& act) {
+  resetFwd(act);
+  stream_->submit(pipelineFwd_);
+  real* v = act.value->getData();
+  real threshold = exp(-64);
+#pragma omp parallel for
+  for (size_t i = 0; i < act.value->getElementCnt(); ++i) {
+    v[i] = v[i] < threshold ? threshold : v[i];
+  }
+  return Error();
+}
+
+Error __must_check MKLDNNSoftmaxActivation::backward(Argument& act) {
+  MatrixPtr outputV = act.value;
+  MatrixPtr outputG = act.grad;
+  Matrix::resizeOrCreate(sftMaxDot_,
+                         outputG->getHeight(),
+                         outputG->getWidth(),
+                         /* trans */ false,
+                         /* useGpu */ false);
+  Matrix::resizeOrCreate(sftMaxSum_,
+                         outputG->getHeight(),
+                         1,
+                         /* trans */ false,
+                         /* useGpu */ false);
+  sftMaxDot_->dotMul(*outputG, *outputV);
+  sftMaxSum_->colMerge(*sftMaxDot_);
+  act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
+  return Error();
+}
 
 ActivationFunction* MKLDNNActivation::create(const std::string& type) {
   return gMKLDNNActivationRegistrar.createByType(type);
@@ -84,4 +216,34 @@ std::vector<std::string> MKLDNNActivation::getAllRegisteredTypes() {
   return types;
 }
 
+void MKLDNNActivation::resetFwd(Argument& act) {
+  VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
+  cnt_ = act.value->getElementCnt();
+  pipelineFwd_.clear();
+  stream_.reset(new MKLDNNStream());
+  engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
+  val_ = std::dynamic_pointer_cast<MKLDNNMatrix>(act.value);
+  if (val_ == nullptr) {
+    int bs = act.getBatchSize();
+    int ih = act.getFrameHeight() > 0 ? act.getFrameHeight() : 1;
+    int iw = act.getFrameWidth() > 0 ? act.getFrameWidth() : 1;
+    int ic = cnt_ / bs / ih / iw;
+    CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
+    val_ = MKLDNNMatrix::create(
+        act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_);
+    CHECK(val_);
+    val_->downSpatial();
+  }
+}
+
+Error __must_check MKLDNNActivation::forward(Argument& act) {
+  resetFwd(act);
+  stream_->submit(pipelineFwd_);
+  return Error();
+}
+Error __must_check MKLDNNActivation::backward(Argument& act) {
+  resetBwd(act);
+  stream_->submit(pipelineBwd_);
+  return Error();
+}
 }  // namespace paddle
diff --git a/paddle/gserver/activations/MKLDNNActivation.h b/paddle/gserver/activations/MKLDNNActivation.h
index 86ffe387366409d81a91740cc8cea886e618f7e2..dd16421fd6e93b49c30b1d3b601f95980afec57b 100644
--- a/paddle/gserver/activations/MKLDNNActivation.h
+++ b/paddle/gserver/activations/MKLDNNActivation.h
@@ -36,6 +36,7 @@ protected:
   // mkldnn matrix, primitive, stream and pipeline
   MKLDNNMatrixPtr val_;
   MKLDNNMatrixPtr grad_;
+  std::shared_ptr<mkldnn::engine> engine_;
   std::shared_ptr<MKLDNNStream> stream_;
   std::shared_ptr<mkldnn::primitive> fwd_;
   std::shared_ptr<mkldnn::primitive> bwd_;
@@ -48,8 +49,18 @@ public:
   static ActivationFunction* create(const std::string& type);
   static std::vector<std::string> getAllRegisteredTypes();
   virtual const std::string& getName() const = 0;
-  virtual Error __must_check forward(Argument& act) = 0;
-  virtual Error __must_check backward(Argument& act) = 0;
+  /**
+   * reset the forward primitives
+   */
+  virtual void resetFwd(Argument& act);
+  /**
+   * reset the backward primitives,
+   * can not merge this functions into resetFwd as the grad data
+   * would be changing before backward.
+   */
+  virtual void resetBwd(Argument& act) {}
+  virtual Error __must_check forward(Argument& act);
+  virtual Error __must_check backward(Argument& act);
 };
 
 /**
@@ -59,6 +70,7 @@ public:
 class MKLDNNEltwiseActivation : public MKLDNNActivation {
   typedef mkldnn::eltwise_forward eltwise_fwd;
   typedef mkldnn::eltwise_backward eltwise_bwd;
+  typedef mkldnn::algorithm algorithm;
 
 protected:
   // save the forward primitive desc, which can be used backward
@@ -70,9 +82,7 @@ protected:
 
 public:
   MKLDNNEltwiseActivation() {}
-
   ~MKLDNNEltwiseActivation() {}
-
   virtual const std::string& getName() const = 0;
 
   // in common, the alpha of forward and backward should be equal.
@@ -80,104 +90,30 @@ public:
   virtual float getAlpha() const = 0;
   virtual float getBwdAlpha() const = 0;
   virtual float getBeta() const { return 0.f; }
-  virtual mkldnn::algorithm getAlgo(const std::string& type) const {
-    if (type == "mkldnn_relu") {
-      return mkldnn::algorithm::eltwise_relu;
-    } else if (type == "mkldnn_tanh") {
-      return mkldnn::algorithm::eltwise_tanh;
-    } else if (type == "mkldnn_elu") {
-      return mkldnn::algorithm::eltwise_elu;
-    } else {
-      LOG(FATAL) << "Unkown eltwise activation type: " << type;
-    }
-    return (mkldnn::algorithm)0;
-  }
-
-  /**
-   * reshape and reset the forward primitives
-   */
-  void resetFwd(Argument& act) {
-    if (cnt_ == act.value->getElementCnt()) {
-      return;
-    }
-    cnt_ = act.value->getElementCnt();
-    stream_.reset(new MKLDNNStream());
-    auto eng = CPUEngine::Instance().getEngine();
-
-    // get algo setting
-    mkldnn::algorithm algo = getAlgo(this->getName());
-    // note: alpha represents the NegativeSlope when used in relu.
-    float alpha = getAlpha();
-    float beta = getBeta();
-
-    /// forward
-    pipelineFwd_.clear();
-    val_ = std::dynamic_pointer_cast<MKLDNNMatrix>(act.value);
-    if (val_ == nullptr) {
-      int bs = act.getBatchSize();
-      int ih = act.getFrameHeight() > 0 ? act.getFrameHeight() : 1;
-      int iw = act.getFrameWidth() > 0 ? act.getFrameWidth() : 1;
-      int ic = cnt_ / bs / ih / iw;
-      CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
-      val_ = MKLDNNMatrix::create(
-          act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, eng);
-      CHECK(val_);
-    }
-    auto fwdDesc = eltwise_fwd::desc(mkldnn::prop_kind::forward_training,
-                                     algo,
-                                     val_->getMemoryDesc(),
-                                     alpha,
-                                     beta);
-    fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, eng));
-    // use inplace for forward but save input value before submit
-    inVal_ = val_;
-    copyInVal_ = nullptr;
-    if (act.grad && algo == mkldnn::algorithm::eltwise_tanh) {
-      // tanh need save src input for backward
-      inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc());
-      copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
-      CHECK(copyInVal_) << "should not be emptry";
-      pipelineFwd_.push_back(*copyInVal_);
-    }
-    fwd_.reset(new eltwise_fwd(*fwdPD_, *val_, *val_));
-    pipelineFwd_.push_back(*fwd_);
-    needResetBwd_ = true;
-  }
+  virtual algorithm getAlgo(std::string type) const;
+  void resetFwd(Argument& act) override;
+  void resetBwd(Argument& act) override;
+};
 
-  /**
-   * reset the backward primitives, can not merge into resetFwd as the grad data
-   * would be changing before backward.
-   */
-  void resetBwd(Argument& act) {
-    if (!needResetBwd_) {
-      return;
-    }
-    needResetBwd_ = false;
-    mkldnn::algorithm algo = getAlgo(this->getName());
-    float alpha = getBwdAlpha();
-    float beta = getBeta();
-    grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc());
-    auto eng = CPUEngine::Instance().getEngine();
-    auto bwdDesc = eltwise_bwd::desc(
-        algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
-    auto bwdPD = eltwise_bwd::primitive_desc(bwdDesc, eng, *fwdPD_);
-    CHECK(inVal_);
-    bwd_.reset(new eltwise_bwd(bwdPD, *inVal_, *grad_, *grad_));
-    pipelineBwd_.clear();
-    pipelineBwd_.push_back(*bwd_);
-  }
+/**
+ * @brief Base class of MKLDNN softmax Activation,
+ * only have mkldnn forward, use cpu implement for backward.
+ */
+class MKLDNNSoftmaxActivation : public MKLDNNActivation {
+  typedef mkldnn::softmax_forward softmax_fwd;
 
-  Error __must_check forward(Argument& act) {
-    resetFwd(act);
-    stream_->submit(pipelineFwd_);
-    return Error();
-  }
+private:
+  // for backward
+  MatrixPtr sftMaxSum_;
+  MatrixPtr sftMaxDot_;
 
-  Error __must_check backward(Argument& act) {
-    resetBwd(act);
-    stream_->submit(pipelineBwd_);
-    return Error();
-  }
+public:
+  MKLDNNSoftmaxActivation() {}
+  ~MKLDNNSoftmaxActivation() {}
+  virtual const std::string& getName() const = 0;
+  void resetFwd(Argument& act) override;
+  Error __must_check forward(Argument& act) override;
+  Error __must_check backward(Argument& act) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
index 88b047c89bd40aba1afc456c22a2870c62989c1c..9a0abd291ae8fae43b0e95c7371f3ce35d1261ec 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -64,7 +64,7 @@ bool MKLDNNConvLayer::init(const LayerMap& layerMap,
 
   // create biases
   if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_));
+    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
   }
   return true;
 }
@@ -251,22 +251,31 @@ void MKLDNNConvLayer::resetInValue(
   // create buffer and reorder if input value do not match
   cpuInVal_ = nullptr;
   cvtInVal_ = nullptr;
-  if (inputIsOnlyMKLDNN()) {
-    MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
-    CHECK(dnnIn) << "Input should be MKLDNNMatrix";
-    if (dnnIn->getPrimitiveDesc() != in->getPrimitiveDesc()) {
-      CHECK_EQ(dnnIn->getFormat(), format::nc);
+
+  MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
+  CHECK_EQ(inputIsOnlyMKLDNN(), dnnIn != nullptr);
+  if (dnnIn != nullptr && dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) {
+    in = dnnIn;
+    return;
+  }
+  if (dnnIn) {
+    if (dnnIn->getFormat() == format::nc) {
       CHECK(ih_ == 1 && iw_ == 1) << "when input is nc format";
       // create a new one with nchw format and same data
       memory::dims inDims = memory::dims{bs_, ic_, 1, 1};
       dnnIn = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_);
-      CHECK(dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc());
     }
-    in = dnnIn;
+    if (dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) {
+      in = dnnIn;
+      return;
+    }
+    cpuInVal_ = dnnIn;
+    in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc());
+    cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in);
+    CHECK(cvtInVal_) << "should not be emptry";
   } else {
-    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
     memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
-    cpuInVal_ = MKLDNNMatrix::create(cpuIn, inDims, format::nchw, engine_);
+    cpuInVal_ = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_);
     if (cpuInVal_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
       // create new mkldnn matrix
       in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc());
@@ -535,7 +544,7 @@ void MKLDNNConvLayer::resetWgtValBwdData(
   } else {
     wgtValBwdData_ = wgtVal_;
   }
-  VLOG(MKLDNN_FMTS) << "weight value format for backward data"
+  VLOG(MKLDNN_FMTS) << "weight value format for backward data: "
                     << wgtValBwdData_->getFormat();
 }
 
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index afd092666bf8b8a3389b36aa1f0edb256a9968e6..8cbfbd0d2b9f2149f7c959aec5c4ae1de952f903 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -49,7 +49,7 @@ bool MKLDNNFcLayer::init(const LayerMap& layerMap,
 
   // create biases
   if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_));
+    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
   }
   return true;
 }
@@ -161,9 +161,16 @@ void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) {
 
 void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
                                       MKLDNNMatrixPtr& bias) {
+  format wgtFmt = format::oihw;
+  if (inVal_->getFormat() == format::nChw8c) {
+    wgtFmt = format::oIhw8i;
+  } else if (inVal_->getFormat() == format::nChw16c) {
+    wgtFmt = format::oIhw16i;
+  }
   wgt = MKLDNNMatrix::create(
-      weight_->getW(), {oc_, ic_, ih_, iw_}, format::oihw, engine_);
+      weight_->getW(), {oc_, ic_, ih_, iw_}, wgtFmt, engine_);
   wgt->downSpatial();
+  VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
 
   bias = (biases_ && biases_->getW())
              ? MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_)
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index d8555a833187ddf64b096135e920e5be2b3a8c2f..c09fd89462ef4fdaeaae3e122f96b0cc6ce373ea 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -115,6 +115,7 @@ public:
       copySeqInfoToOutputs();
       size_t elemenCnt = inputLayers_[0]->getOutput().value->getElementCnt();
       if (inputElemenCnt_ != elemenCnt) {
+        VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
         // reset when input total sizes changed, not only the batchsize
         inputElemenCnt_ = elemenCnt;
         reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
@@ -142,6 +143,7 @@ public:
 
   void backward(const UpdateCallback& callback) override {
     if (needResetBwd_) {
+      VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
       resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
       needResetBwd_ = false;
     }
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index 1bfbbde4246a10eaf86693a6a2f237f390966db3..857d07df3e3088be28943d9e2fe58017e9e57f4a 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -222,8 +222,8 @@ static void getAddtoConfig(TestConfig& cfg, const testActDesc& pm) {
 }
 
 void testActivation(std::string& actType, const testActDesc& pm) {
-  // TODO(TJ): mkldnn_softmax not implemented, paddle do not have elu activation
-  if (actType == "mkldnn_softmax" || actType == "mkldnn_elu") {
+  // TODO(TJ): remove me when paddle support elu activation
+  if (actType == "mkldnn_elu") {
     return;
   }
   const std::string compareTypes[] = {actType, actType.erase(0, 7)};
diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu
index 0a6a0fd15c73330902552f7a9aa6339de24c1a18..75e8a989036f0b818687e1fec3e600bb90e86b22 100644
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -69,8 +69,12 @@ class AccuracyOpCUDAKernel : public framework::OpKernel {
       return;
     }
 
-    AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS>>>(
-        num_samples, infer_width, inference_data, label_data, accuracy_data);
+    AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
+        1, PADDLE_CUDA_NUM_THREADS, 0,
+        reinterpret_cast<const platform::CUDADeviceContext&>(
+            ctx.device_context())
+            .stream()>>>(num_samples, infer_width, inference_data, label_data,
+                         accuracy_data);
   }
 };
 
diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h
index 2f40c059033ec649b29f6ecdee4fcedd128a63a6..ac3aeaf41e206c1deb74c7022c36f02c4777a84b 100644
--- a/paddle/operators/crop_op.h
+++ b/paddle/operators/crop_op.h
@@ -38,10 +38,10 @@ class CropKernel : public framework::OpKernel {
     auto out_stride = framework::stride(out->dims());
     auto offsets = context.Attr<std::vector<int>>("offsets");
     PADDLE_ENFORCE_EQ(
-        x->dims().size(), offsets.size(),
+        x->dims().size(), static_cast<int64_t>(offsets.size()),
         "Offsets size should be equal to dimension size of input tensor.");
     int64_t offset = 0;
-    for (int i = 0; i < offsets.size(); ++i) {
+    for (size_t i = 0; i < offsets.size(); ++i) {
       offset += (x_stride[i] * offsets[i]);
     }
     StridedMemcpy<T>(context.device_context(), x_data + offset, x_stride,
@@ -57,7 +57,7 @@ void CropGradFunction(const framework::ExecutionContext& context) {
     d_x->mutable_data<T>(context.GetPlace());
     auto offsets = context.Attr<std::vector<int>>("offsets");
     Eigen::array<std::pair<int, int>, D> paddings;
-    for (int i = 0; i < D; ++i) {
+    for (size_t i = 0; i < D; ++i) {
       paddings[i].first = offsets[i];
       paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i];
     }
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index 679f068c3d2bf0223ccf7bba82b003139f273125..2e16201e74c153888594ebe6679fb0036734dad4 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -23,30 +23,28 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should be not null.");
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
-                            "Input(Label) must not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), "Output(Y) must not be null.");
+                            "Input(Label) should be not null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"),
+                            "Output(Y) should be not null.");
 
     auto x = ctx.Input<Tensor>("X");
     auto label = ctx.Input<Tensor>("Label");
-    PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2.");
+    PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(label->dims().size(), 2,
-                      "Input(Label)'s rank must be 2.");
-    // TODO(xinghai-sun): remove this check after swtiching to bool
-    PADDLE_ENFORCE(ctx.Attr<int>("soft_label") == 0 ||
-                   ctx.Attr<int>("soft_label") == 1);
+                      "Input(Label)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
-                      "The 1st dimension of Input(X) and Input(Label) must "
+                      "The 1st dimension of Input(X) and Input(Label) should "
                       "be equal.");
-    if (ctx.Attr<int>("soft_label") == 1) {
+    if (ctx.Attr<bool>("softLabel")) {
       PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
-                        "If Attr(soft_label) == 1, The 2nd dimension of "
-                        "Input(X) and Input(Label) must be equal.");
+                        "If Attr(softLabel) == true, the 2nd dimension of "
+                        "Input(X) and Input(Label) should be equal.");
     } else {
       PADDLE_ENFORCE_EQ(label->dims()[1], 1,
-                        "If Attr(soft_label) == 0, The 2nd dimension of "
-                        "Input(Label) must be 1.");
+                        "If Attr(softLabel) == false, the 2nd dimension of "
+                        "Input(Label) should be 1.");
     }
 
     ctx.Output<Tensor>("Y")->Resize({x->dims()[0], 1});
@@ -60,38 +58,38 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should be not null.");
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
-                            "Input(Label) must not be null.");
+                            "Input(Label) should be not null.");
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")),
-                            "Input(Y@GRAD) must not be null.");
+                            "Input(Y@GRAD) shoudl be not null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar(framework::GradVarName("X")),
+                            "Output(X@GRAD) should be not null.");
 
     auto x = ctx.Input<Tensor>("X");
     auto label = ctx.Input<Tensor>("Label");
     auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2.");
-    PADDLE_ENFORCE_EQ(dy->dims().size(), 2, "Input(Y@Grad)'s rank must be 2.");
+    PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(dy->dims().size(), 2,
+                      "Input(Y@Grad)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(label->dims().size(), 2,
-                      "Input(Label)'s rank must be 2.");
-    // TODO(xinghai-sun): remove this check after swtiching to bool
-    PADDLE_ENFORCE(ctx.Attr<int>("soft_label") == 0 ||
-                   ctx.Attr<int>("soft_label") == 1);
+                      "Input(Label)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
-                      "The 1st dimension of Input(X) and Input(Label) must "
+                      "The 1st dimension of Input(X) and Input(Label) should "
                       "be equal.");
     PADDLE_ENFORCE_EQ(x->dims()[0], dy->dims()[0],
-                      "The 1st dimension of Input(X) and Input(Y@Grad) must "
+                      "The 1st dimension of Input(X) and Input(Y@Grad) should "
                       "be equal.");
     PADDLE_ENFORCE_EQ(dy->dims()[1], 1,
-                      "The 2nd dimension of Input(Y@Grad) must be 1.");
-    if (ctx.Attr<int>("soft_label") == 1) {
+                      "The 2nd dimension of Input(Y@Grad) should be 1.");
+    if (ctx.Attr<bool>("softLabel")) {
       PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
-                        "If Attr(soft_label) == 1, The 2nd dimension of "
-                        "Input(X) and Input(Label) must be equal.");
+                        "When Attr(softLabel) == true, the 2nd dimension of "
+                        "Input(X) and Input(Label) should be equal.");
     } else {
       PADDLE_ENFORCE_EQ(label->dims()[1], 1,
-                        "If Attr(soft_label) == 0, The 2nd dimension of "
-                        "Input(Label) must be 1.");
+                        "When Attr(softLabel) == false, the 2nd dimension of "
+                        "Input(Label) should be 1.");
     }
 
     auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
@@ -104,23 +102,39 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
   CrossEntropyOpMaker(framework::OpProto *proto,
                       framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The first input of CrossEntropyOp");
-    AddInput("Label", "The second input of CrossEntropyOp");
-    AddOutput("Y", "The output of CrossEntropyOp");
-    AddAttr<int>("soft_label", "Is soft label. Default zero.").SetDefault(0);
-
+    AddInput("X",
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, "
+             "where N is the batch size and D is the number of classes. "
+             "This input is a probability computed by the previous operator, "
+             "which is almost always the result of a softmax operator.");
+    AddInput(
+        "Label",
+        "(Tensor, default Tensor<int>), the ground truth which is "
+        "a 2-D tensor. "
+        "When softLabel is set to false, `Label` is a Tensor<int> with shape "
+        "[N x 1]. "
+        "When softLabel is set to true, `Label` is a Tensor<float/double> "
+        "with shape [N x K].");
+    AddOutput("Y",
+              "(Tensor, default Tensor<float>), a 2-D tensor "
+              "with shape [N x 1]. The cross entropy loss.");
+    AddAttr<bool>(
+        "softLabel",
+        "(bool, default false), a flag to indicate whether to interpretate "
+        "the given labels as soft labels.")
+        .SetDefault(false);
     AddComment(R"DOC(
 CrossEntropy Operator.
 
 It supports both standard cross-entropy and soft-label cross-entropy loss
 computation.
 1) One-hot cross-entropy:
-    soft_label = 0, Label[i, 0] indicates the class index for sample i:
+    softLabel = false, Label[i, 0] indicates the class index for sample i:
 
                 Y[i] = -log(X[i, Label[i]])
 
 2) Soft-label cross-entropy:
-    soft_label = 1, Label[i, j] indicates the soft label of class j
+    softLabel = true, Label[i, j] indicates the soft label of class j
     for sample i:
 
                 Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu
index ab6ad0e062269483948bf70e492c9431991221fb..18e44d77c9f62b296dc57952e546f844670c7d57 100644
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -28,26 +28,49 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x) {
     PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
-    Y[i] = -tolerable_value(log(X[i * D + label[i]]));
+    Y[i] = -TolerableValue<T>()(log(X[i * D + label[i]]));
   }
 }
 
+template <typename T>
+__device__ __forceinline__ T sum_single_warp(T val) {
+  val += __shfl_down(val, 16);
+  val += __shfl_down(val, 8);
+  val += __shfl_down(val, 4);
+  val += __shfl_down(val, 2);
+  val += __shfl_down(val, 1);
+  return val;
+}
+
 template <typename T>
 __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
-                                       const int N, const int D) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    T sum = static_cast<T>(0);
-    for (int j = 0; j < D; j++) {
-      sum += label[i * D + j] * tolerable_value(log(X[i * D + j]));
-    }
-    Y[i] = -sum;
+                                       const int class_num) {
+  int tid = threadIdx.x;
+  extern __shared__ T d_sum[];
+  d_sum[tid] = 0;
+
+  int cur_idx = tid;
+  int next_idx = blockIdx.x * class_num + tid;
+  while (cur_idx < class_num) {
+    d_sum[tid] += TolerableValue<T>()(std::log(X[next_idx])) * label[next_idx];
+    next_idx += blockDim.x;
+    cur_idx += blockDim.x;
+  }
+  __syncthreads();
+
+  for (unsigned int stride = blockDim.x >> 1; stride >= 32; stride >>= 1) {
+    if (tid < stride) d_sum[tid] += d_sum[tid + stride];
+    __syncthreads();
   }
+
+  T val = d_sum[tid];
+  val = sum_single_warp<T>(val);
+  if (tid == 0) Y[blockIdx.x] = -val;
 }
 
-// TODO(qingqing): make zero setting an common function.
+// TODO(qingqing): make zero setting a common function.
 template <typename T>
-__global__ void zero(T* X, const int N) {
+__global__ void Zero(T* X, const int N) {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x) {
     X[i] = 0.0;
@@ -71,13 +94,10 @@ template <typename T>
 __global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
                                                const T* label, const int N,
                                                const int D) {
-  // TOOD(qingqing): optimize for this kernel
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    for (int j = 0; j < D; ++j) {
-      int idx = i * D + j;
-      dX[idx] = -label[idx] * dY[i] / X[idx];
-    }
+  int ids = blockIdx.x * blockDim.x + threadIdx.x;
+  if (ids < N * D) {
+    int row_ids = ids / D;
+    dX[ids] = -label[ids] * dY[row_ids] / X[ids];
   }
 }
 
@@ -86,29 +106,36 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "This kernel only runs on GPU device.");
 
-    auto x = ctx.Input<Tensor>("X");
-    auto y = ctx.Output<Tensor>("Y");
-    auto label = ctx.Input<Tensor>("Label");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* label = ctx.Input<Tensor>("Label");
+    Tensor* y = ctx.Output<Tensor>("Y");
 
-    auto* x_data = x->data<T>();
-    y->mutable_data<T>(ctx.GetPlace());
-    auto* y_data = y->data<T>();
+    const T* x_data = x->data<T>();
+    T* y_data = y->mutable_data<T>(ctx.GetPlace());
 
-    int n = x->dims()[0];
-    int d = x->dims()[1];
-    int block = 512;
-    int grid = (n + block - 1) / block;
-    // TODO(qingqing) launch kernel on specified stream
-    // base on ExecutionContext.
-    if (ctx.Attr<int>("soft_label") == 1) {
+    int batch_size = x->dims()[0];
+    int class_num = x->dims()[1];
+
+    if (ctx.Attr<bool>("softLabel")) {
       auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
-      SoftCrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n,
-                                                 d);
+      int block = class_num > 512 ? 512 : pow(2, int(std::log2(class_num)));
+
+      SoftCrossEntropyKernel<
+          T><<<batch_size, block, block * sizeof(T),
+               reinterpret_cast<const platform::CUDADeviceContext&>(
+                   ctx.device_context())
+                   .stream()>>>(y_data, x_data, label_data, class_num);
     } else {
       auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
-      CrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n, d);
+      int block = 512;
+      int grid = (batch_size + block - 1) / block;
+      CrossEntropyKernel<T><<<
+          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                              ctx.device_context())
+                              .stream()>>>(y_data, x_data, label_data,
+                                           batch_size, class_num);
     }
   }
 };
@@ -118,33 +145,43 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "This kernel only runs on GPU device.");
+
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* label = ctx.Input<Tensor>("Label");
+    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
 
-    auto x = ctx.Input<Tensor>("X");
-    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto label = ctx.Input<Tensor>("Label");
+    const T* dy_data =
+        ctx.Input<Tensor>(framework::GradVarName("Y"))->data<T>();
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    const T* x_data = x->data<T>();
 
-    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    auto* dy_data = dy->data<T>();
-    auto* x_data = x->data<T>();
+    int batch_size = x->dims()[0];
+    int class_num = x->dims()[1];
 
-    int n = x->dims()[0];
-    int d = x->dims()[1];
     int block = 512;
-    int grid = (n * d + block - 1) / block;
-    zero<T><<<grid, block>>>(dx_data, n * d);
-    grid = (n + block - 1) / block;
-    // TODO(qingqing): launch kernel on specified stream
-    // base on ExecutionContext.
-    if (ctx.Attr<int>("soft_label") == 1) {
+    int grid = (batch_size * class_num + block - 1) / block;
+
+    if (ctx.Attr<bool>("softLabel")) {
       auto* label_data = label->data<T>();
-      SoftCrossEntropyGradientKernel<T><<<grid, block>>>(
-          dx_data, dy_data, x_data, label_data, n, d);
+      SoftCrossEntropyGradientKernel<T><<<
+          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                              ctx.device_context())
+                              .stream()>>>(dx_data, dy_data, x_data, label_data,
+                                           batch_size, class_num);
     } else {
+      Zero<T><<<grid, block, 0,
+                reinterpret_cast<const platform::CUDADeviceContext&>(
+                    ctx.device_context())
+                    .stream()>>>(dx_data, batch_size * class_num);
+
       auto* label_data = label->data<int>();
-      CrossEntropyGradientKernel<T><<<grid, block>>>(dx_data, dy_data, x_data,
-                                                     label_data, n, d);
+      grid = (batch_size + block - 1) / block;
+      CrossEntropyGradientKernel<T><<<
+          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                              ctx.device_context())
+                              .stream()>>>(dx_data, dy_data, x_data, label_data,
+                                           batch_size, class_num);
     }
   }
 };
diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h
index 1b4b23ac2029138afadef0168262203ac2e20430..255b2e9f5ea7566cca7fd3914e38da804b7c7006 100644
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/platform/hostdevice.h"
 
@@ -20,53 +21,51 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename T>
-HOSTDEVICE T tolerable_value(const T x) {
-  PADDLE_ASSERT(std::is_floating_point<T>::value);
-  const T kApproInf = 1e20;
-  if (x == INFINITY) {
-    return kApproInf;
+struct TolerableValue {
+  HOSTDEVICE T operator()(const T& x) const {
+    PADDLE_ASSERT(std::is_floating_point<T>::value);
+    const T kApproInf = 1e20;
+
+    if (x == INFINITY) return kApproInf;
+    if (x == -INFINITY) return -kApproInf;
+    return x;
   }
-  if (x == -INFINITY) {
-    return -kApproInf;
-  }
-  return x;
-}
+};
 
 template <typename T>
 class CrossEntropyOpKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-
-    auto x = ctx.Input<Tensor>("X");
-    auto y = ctx.Output<Tensor>("Y");
-
-    auto* x_data = x->data<T>();
-    y->mutable_data<T>(ctx.GetPlace());
-    auto* y_data = y->data<T>();
-
-    int batch_size = x->dims()[0];
-    int class_num = x->dims()[1];
-
-    if (ctx.Attr<int>("soft_label") == 1) {
-      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
-      int index = 0;
-      for (int i = 0; i < batch_size; ++i) {
-        T sum = static_cast<T>(0);
-        for (int j = 0; j < class_num; ++j) {
-          sum += label_data[index] * tolerable_value(std::log(x_data[index]));
-          y_data[i] = -sum;
-          index++;
-        }
-      }
+                   "This kernel only runs on CPU.");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* labels = ctx.Input<Tensor>("Label");
+    Tensor* y = ctx.Output<Tensor>("Y");
+    T* y_data = y->mutable_data<T>(ctx.GetPlace());
+
+    const int batch_size = x->dims()[0];
+    if (ctx.Attr<bool>("softLabel")) {
+      auto prob = EigenMatrix<T>::From(*x);
+      auto lbl_mat = EigenMatrix<T>::From(*labels);
+      auto loss = EigenMatrix<T>::From(*y);
+
+      loss.device(ctx.GetEigenDevice<platform::CPUPlace>()) =
+          -((lbl_mat * prob.log().unaryExpr(TolerableValue<T>()))
+                .sum(Eigen::DSizes<int, 1>(1))
+                .reshape(Eigen::DSizes<int, 2>(batch_size, 1)));
     } else {
-      auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
+      const int class_num = x->dims()[1];
+      const T* x_data = x->data<T>();
+
+      const int* label_data = labels->data<int>();
       for (int i = 0; i < batch_size; ++i) {
         int index = i * class_num + label_data[i];
-        y_data[i] = -tolerable_value(std::log(x_data[index]));
+        y_data[i] = -TolerableValue<T>()(std::log(x_data[index]));
       }
     }
   }
@@ -77,33 +76,32 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-
-    auto x = ctx.Input<Tensor>("X");
-    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto label = ctx.Input<Tensor>("Label");
+                   "This kernel only runs on CPU.");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const Tensor* label = ctx.Input<Tensor>("Label");
+    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
 
-    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    auto* dy_data = dy->data<T>();
-    auto* x_data = x->data<T>();
-
-    int batch_size = x->dims()[0];
     int class_num = x->dims()[1];
-
-    // TODO(qingqing): make zero setting an common function.
-    if (ctx.Attr<int>("soft_label") == 1) {
-      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
-      int index = 0;
-      for (int i = 0; i < batch_size; ++i) {
-        for (int j = 0; j < class_num; ++j) {
-          dx_data[index] = -label_data[index] * dy_data[i] / x_data[index];
-          index++;
-        }
-      }
+    if (ctx.Attr<bool>("softLabel")) {
+      auto x_mat = EigenMatrix<T>::From(*x);
+      auto dy_mat = EigenMatrix<T>::From(*dy);
+      auto lbl_mat = EigenMatrix<T>::From(*label);
+      auto dx_mat = EigenMatrix<T>::From(*dx);
+
+      dx_mat.device(ctx.GetEigenDevice<platform::CPUPlace>()) =
+          -(lbl_mat * dy_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) /
+            x_mat);
     } else {
-      auto* label_data = label->data<int>();
+      int batch_size = x->dims()[0];
+      const T* dy_data = dy->data<T>();
+      const T* x_data = x->data<T>();
+      const int* label_data = label->data<int>();
+
+      // TODO(qingqing): make zero setting a common function.
       memset(dx_data, 0, sizeof(T) * batch_size * class_num);
+
       for (int i = 0; i < batch_size; ++i) {
         PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
         int index = i * class_num + label_data[i];
diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc
index 7a6351b61287eccb0454fe279ea9bf38ed055bdf..2130eda6a42c893d8ec251a7022a0bfa44433bb7 100644
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -28,13 +28,10 @@ class DropoutOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE_GE(ctx.Attr<float>("dropout_prob"), 0);
     PADDLE_ENFORCE_LE(ctx.Attr<float>("dropout_prob"), 1);
-    // TODO(xinghai-sun): remove this check after swtiching to bool
-    PADDLE_ENFORCE(ctx.Attr<int>("is_training") == 0 ||
-                   ctx.Attr<int>("is_training") == 1);
 
     auto dims = ctx.Input<Tensor>("X")->dims();
     ctx.Output<Tensor>("Out")->Resize(dims);
-    if (ctx.Attr<int>("is_training") == 1) {
+    if (ctx.Attr<bool>("is_training")) {
       ctx.Output<Tensor>("Mask")->Resize(dims);
     }
     ctx.ShareLoD("X", /*->*/ "Out");
@@ -49,8 +46,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddAttr<AttrType>("dropout_prob", "Probability of setting units to zero.")
         .SetDefault(.5f);
-    // TODO(xinghai-sun): use bool for is_training after bool is supported.
-    AddAttr<int>("is_training", "Whether in training phase.").SetDefault(1);
+    AddAttr<bool>("is_training", "Whether in training phase.").SetDefault(true);
     AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
     AddInput("X", "The input of dropout op.");
     AddOutput("Out", "The output of dropout op.");
@@ -59,7 +55,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Dropout Operator.
 
-"Dropout" refers to randomly dropping out units in a nerual network. It is a
+'Dropout' refers to randomly dropping out units in a nerual network. It is a
 regularization technique for reducing overfitting by preventing neuron
 co-adaption during training. The dropout operator randomly set (according to
 the given dropout probability) the outputs of some units to zero, while others
@@ -75,8 +71,8 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx.Attr<int>("is_training"), 1,
-                      "GradOp is only callable when is_training is true");
+    PADDLE_ENFORCE(ctx.Attr<bool>("is_training"),
+                   "GradOp is only callable when is_training is true");
 
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Mask"), "Mask must not be null.");
@@ -85,9 +81,6 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
 
     PADDLE_ENFORCE_GE(ctx.Attr<AttrType>("dropout_prob"), 0);
     PADDLE_ENFORCE_LE(ctx.Attr<AttrType>("dropout_prob"), 1);
-    // TODO(xinghai-sun): remove this check after swtiching to bool
-    PADDLE_ENFORCE(ctx.Attr<int>("is_training") == 0 ||
-                   ctx.Attr<int>("is_training") == 1);
     auto x_dims = ctx.Input<Tensor>("X")->dims();
     auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
     PADDLE_ENFORCE_EQ(x_dims, out_dims,
diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu
index 186237fb238add37f32403309a0f7e8a9846d335..a04e4a22cc09d4e8106a528e490ccf8e90681c08 100644
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -59,7 +59,7 @@ class GPUDropoutKernel : public framework::OpKernel {
     auto Y = EigenMatrix<T>::Reshape(*y, 1);
 
     auto place = context.GetEigenDevice<Place>();
-    if (context.Attr<int>("is_training") == 1) {
+    if (context.Attr<bool>("is_training")) {
       auto* mask = context.Output<Tensor>("Mask");
       auto* mask_data = mask->mutable_data<T>(context.GetPlace());
       int size = framework::product(mask->dims());
diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h
index 82eafee0e0e7db7b4b4ae5405f37146d061aefd5..d57f64afcb3558aeea6aed23fae06866e9af874a 100644
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@@ -35,7 +35,7 @@ class CPUDropoutKernel : public framework::OpKernel {
     auto* y_data = y->mutable_data<T>(context.GetPlace());
     AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
 
-    if (context.Attr<int>("is_training") == 1) {
+    if (context.Attr<bool>("is_training")) {
       auto* mask = context.Output<Tensor>("Mask");
       auto* mask_data = mask->mutable_data<T>(context.GetPlace());
       int seed = context.Attr<int>("seed");
@@ -65,8 +65,8 @@ template <typename Place, typename T>
 class DropoutGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(context.Attr<int>("is_training"), 1,
-                      "GradOp is only callable when is_training is true");
+    PADDLE_ENFORCE(context.Attr<bool>("is_training"),
+                   "GradOp is only callable when is_training is true");
 
     auto* grad_x = context.Output<Tensor>(framework::GradVarName("X"));
     auto* grad_y = context.Input<Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/operators/elementwise_add_op.cc b/paddle/operators/elementwise_add_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5f7b654d69f081dfa85b0d61960eb52b7982faa1
--- /dev/null
+++ b/paddle/operators/elementwise_add_op.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/elementwise_add_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwiseAddOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwiseAddOpMaker(framework::OpProto* proto,
+                        framework::OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("add", "Out = X + Y");
+    AddComment(comment_);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker,
+            elementwise_add_grad, ops::ElementwiseOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_add,
+    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_add_grad,
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/sequence_avg_pool_op.cu b/paddle/operators/elementwise_add_op.cu
similarity index 74%
rename from paddle/operators/sequence_avg_pool_op.cu
rename to paddle/operators/elementwise_add_op.cu
index bc9d1611fccd17c99b914b6ef59995288a9ebbd6..85d063a76b5592c716a5bdf23a0993976abc6ae4 100644
--- a/paddle/operators/sequence_avg_pool_op.cu
+++ b/paddle/operators/elementwise_add_op.cu
@@ -13,13 +13,13 @@
    limitations under the License. */
 
 #define EIGEN_USE_GPU
-
-#include "paddle/operators/sequence_avg_pool_op.h"
+#include "paddle/operators/elementwise_add_op.h"
 
 namespace ops = paddle::operators;
+
 REGISTER_OP_GPU_KERNEL(
-    sequence_avg_pool,
-    ops::SequenceAvgPoolKernel<paddle::platform::GPUPlace, float>);
+    elementwise_add,
+    ops::ElementwiseAddKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    sequence_avg_pool_grad,
-    ops::SequenceAvgPoolGradKernel<paddle::platform::GPUPlace, float>);
+    elementwise_add_grad,
+    ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e9f1ffba6fb23f5394713c67aa4363b85717f50
--- /dev/null
+++ b/paddle/operators/elementwise_add_op.h
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class ElementwiseAddKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseCompute<EigenAddFunctor, Place, T>(ctx);
+  }
+};
+
+template <typename T>
+struct ElementwiseAddGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = dz_e;
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseAddOneGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = dz_e.sum();
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseAddBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = dz_e.reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseAddBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = dz_e.reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+
+template <typename Place, typename T>
+class ElementwiseAddGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<Place, T, ElementwiseAddGradFunctor<T>,
+                           ElementwiseAddOneGradFunctor<T>,
+                           ElementwiseAddBroadCastGradFunctor<T>,
+                           ElementwiseAddBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_div_op.cc b/paddle/operators/elementwise_div_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c6898150d310d0c4fdefae5a58a5792a72f9889e
--- /dev/null
+++ b/paddle/operators/elementwise_div_op.cc
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/elementwise_div_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwiseDivOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwiseDivOpMaker(framework::OpProto* proto,
+                        framework::OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Div", "Out = X / Y");
+    AddComment(comment_);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker,
+            elementwise_div_grad, ops::ElementwiseOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_div,
+    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_div_grad,
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/elementwise_div_op.cu b/paddle/operators/elementwise_div_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b96aa31748c77f0d07f9bb7fb19235239983abd5
--- /dev/null
+++ b/paddle/operators/elementwise_div_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_div_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(
+    elementwise_div,
+    ops::ElementwiseDivKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    elementwise_div_grad,
+    ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bd7c8ea548c46ec9b4c5a085e4e70d5dd162f3a
--- /dev/null
+++ b/paddle/operators/elementwise_div_op.h
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class ElementwiseDivKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseCompute<EigenDivFunctor, Place, T>(ctx);
+  }
+};
+
+template <typename T>
+struct ElementwiseDivGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto z_e = framework::EigenVector<T>::Flatten(*z);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e / y_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = -1.0 * dz_e * z_e / y_e;
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseDivBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
+                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e / y_e_bcast;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast))
+                           .reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseDivBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
+                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e / y_e_bcast;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast))
+                           .reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+
+template <typename Place, typename T>
+class ElementwiseDivGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<Place, T, ElementwiseDivGradFunctor<T>,
+                           ElementwiseDivGradFunctor<T>,
+                           ElementwiseDivBroadCastGradFunctor<T>,
+                           ElementwiseDivBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc
index 02bd4c7b85790edba781f234f34bbec160844238..f2544b54d6bc543a50d8de03d482333b485bc076 100644
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
@@ -17,104 +17,25 @@
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-
-class ElementWiseMulOp : public framework::OperatorWithKernel {
+class ElementwiseMulOpMaker : public ElementwiseOpMaker {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of ElementWiseMulOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
-                            "Input(Y) of ElementWiseMulOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.OutputVar("Out"),
-        "Output(Out) of ElementWiseMulOp should not be null.");
-
-    auto x_dim = ctx.Input<Tensor>("X")->dims();
-    auto y_dim = ctx.Input<Tensor>("Y")->dims();
-    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
-                      "Rank of first input must >= rank of second input.")
-    ctx.Output<framework::Tensor>("Out")->Resize(x_dim);
-    ctx.ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class ElementWiseMulOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  ElementWiseMulOpMaker(framework::OpProto *proto,
-                        framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The first input of elementwise mul op");
-    AddInput("Y", "The second input of elementwise mul op");
-    AddAttr<int>("axis",
-                 R"DOC(
-When shape(Y) does not equal shape(X),Y will be broadcasted 
-to match the shape of X and axis should be dimension index Y in X
-        )DOC")
-        .SetDefault(-1)
-        .EqualGreaterThan(-1);
-
-    AddOutput("Out", "The output of elementwise mul op");
-    AddComment(R"DOC(
-Limited elementwise multiple operator.The equation is: Out = X ⊙ Y.
-1. The shape of Y should be same with X or
-2. Y's shape is a subset of X. 
-   Y will be broadcasted to match the shape of X and axis should be dimension index Y in X.
-   example:
-
-      shape(X) = (2, 3, 4, 5), shape(Y) = (,)
-      shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
-      shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
-      shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
-      shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
-
-Both the input X and Y can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with input X.
-)DOC");
+  ElementwiseMulOpMaker(framework::OpProto* proto,
+                        framework::OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Mul", "Out = X ⊙ Y");
+    AddComment(comment_);
   }
 };
 
-class ElementWiseMulOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) should not be null");
-
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto y_dims = ctx.Input<Tensor>("Y")->dims();
-    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
-    auto *x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto *y_grad = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
-
-    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-                      "Rank of first input must >= rank of second input.")
-
-    if (x_grad) {
-      x_grad->Resize(x_dims);
-    }
-
-    if (y_grad) {
-      y_grad->Resize(y_dims);
-    }
-  }
-};
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_mul, ops::ElementWiseMulOp, ops::ElementWiseMulOpMaker,
-            elementwise_mul_grad, ops::ElementWiseMulOpGrad);
+REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
+            elementwise_mul_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul,
-    ops::ElementWiseMulKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul_grad,
-    ops::ElementWiseMulGradKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/elementwise_mul_op.cu b/paddle/operators/elementwise_mul_op.cu
index 56f2087c22c6c599a3c5aef36eb0fe3eac295bef..da08a75596c4d3b89dc8892bd4405464fec96389 100644
--- a/paddle/operators/elementwise_mul_op.cu
+++ b/paddle/operators/elementwise_mul_op.cu
@@ -19,7 +19,7 @@ namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(
     elementwise_mul,
-    ops::ElementWiseMulKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
     elementwise_mul_grad,
-    ops::ElementWiseMulGradKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h
index 6d58da580b81b9e0a8ae170eec1a73638b190df8..1eaf2e3efc97a32739efcaf37066817ee173fadc 100644
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
@@ -13,171 +13,104 @@
    limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/operators/elementwise_op.h"
 
 namespace paddle {
 namespace operators {
-/*
- * Out = X ⊙ Y
- * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
- *    pre=2, n=3*4, post=5
- * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5)
- *    pre=2*3, n=4*5, post=1
- */
-
-inline void get_mid_dims(const framework::DDim& x_dims,
-                         const framework::DDim& y_dims, const int axis,
-                         int& pre, int& n, int& post) {
-  pre = 1;
-  n = 1;
-  post = 1;
-  for (int i = 0; i < axis; ++i) {
-    pre *= x_dims[i];
-  }
-
-  for (int i = 0; i < y_dims.size(); ++i) {
-    PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i],
-                      "Broadcast dimension mismatch.");
-    n *= y_dims[i];
-  }
-
-  for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
-    post *= x_dims[i];
-  }
-}
 
 template <typename Place, typename T>
-class ElementWiseMulKernel : public framework::OpKernel {
+class ElementwiseMulKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* z = ctx.Output<Tensor>("Out");
-    z->mutable_data<T>(ctx.GetPlace());
+    ElementwiseCompute<EigenMulFunctor, Place, T>(ctx);
+  }
+};
 
+template <typename T>
+struct ElementwiseMulGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
     auto x_e = framework::EigenVector<T>::Flatten(*x);
     auto y_e = framework::EigenVector<T>::Flatten(*y);
-    auto z_e = framework::EigenVector<T>::Flatten(*z);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
 
-    auto x_dims = x->dims();
-    auto y_dims = y->dims();
-    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-                      "Rank of first input must >= rank of second input.")
-
-    if (x_dims == y_dims || product(y_dims) == 1) {
-      z_e.device(ctx.GetEigenDevice<Place>()) = x_e * y_e;
-      return;
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e * y_e;
     }
 
-    int axis = ctx.Attr<int>("axis");
-    axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-    PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
-                   "Axis should be in range [0, x_dims)");
-
-    int pre, n, post;
-    get_mid_dims(x_dims, y_dims, axis, pre, n, post);
-    if (post == 1) {
-      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
-                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
-                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-      z_e.device(ctx.GetEigenDevice<Place>()) = x_e * y_bcast;
-      return;
-    } else {
-      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
-                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
-                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-      z_e.device(ctx.GetEigenDevice<Place>()) = x_e * y_bcast;
-      return;
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = x_e * dz_e;
     }
   }
 };
 
-template <typename Place, typename T>
-class ElementWiseMulGradKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
+template <typename T>
+struct ElementwiseMulBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
     auto x_e = framework::EigenVector<T>::Flatten(*x);
     auto y_e = framework::EigenVector<T>::Flatten(*y);
-    auto dout_e = framework::EigenVector<T>::Flatten(*dout);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
 
-    auto x_dims = x->dims();
-    auto y_dims = y->dims();
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
+                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
 
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     if (dx) {
-      dx->mutable_data<T>(ctx.GetPlace());
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e * y_e_bcast;
     }
+
     if (dy) {
-      dy->mutable_data<T>(ctx.GetPlace());
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (x_e * dz_e)
+                           .reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
     }
+  }
+};
 
-    if (x_dims == y_dims || product(y_dims) == 1) {
-      if (dx) {
-        auto dx_e = framework::EigenVector<T>::Flatten(*dx);
-        dx_e.device(ctx.GetEigenDevice<Place>()) = dout_e * y_e;
-      }
-
-      if (dy) {
-        auto dy_e = framework::EigenVector<T>::Flatten(*dy);
-        dy_e.device(ctx.GetEigenDevice<Place>()) = x_e * dout_e;
-      }
-      return;
+template <typename T>
+struct ElementwiseMulBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
+                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e * y_e_bcast;
     }
 
-    int axis = ctx.Attr<int>("axis");
-    axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-
-    int pre, n, post;
-    get_mid_dims(x_dims, y_dims, axis, pre, n, post);
-
-    // TODO(gongweibao): wrap reshape to a function.
-    if (post == 1) {
-      auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
-                           .broadcast(Eigen::DSizes<int, 2>(pre, 1))
-                           .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-      if (dx) {
-        auto dx_e = framework::EigenVector<T>::Flatten(*dx);
-        dx_e.device(ctx.GetEigenDevice<Place>()) = dout_e * y_e_bcast;
-      }
-
-      if (dy) {
-        auto dy_e = framework::EigenVector<T>::Flatten(*dy);
-        dy_e.device(ctx.GetEigenDevice<Place>()) =
-            (x_e * dout_e)
-                .reshape(Eigen::DSizes<int, 2>(pre, n))
-                .sum(Eigen::array<int, 1>{{0}});
-      }
-      return;
-    } else {
-      auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
-                           .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
-                           .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-      if (dx) {
-        auto dx_e = framework::EigenVector<T>::Flatten(*dx);
-        dx_e.device(ctx.GetEigenDevice<Place>()) = dout_e * y_e_bcast;
-      }
-
-      if (dy) {
-        auto dy_e = framework::EigenVector<T>::Flatten(*dy);
-        dy_e.device(ctx.GetEigenDevice<Place>()) =
-            (x_e * dout_e)
-                .reshape(Eigen::DSizes<int, 3>(pre, n, post))
-                .sum(Eigen::array<int, 2>{{0, 2}});
-      }
-      return;
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (x_e * dz_e)
+                           .reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
     }
   }
 };
 
+template <typename Place, typename T>
+class ElementwiseMulGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<Place, T, ElementwiseMulGradFunctor<T>,
+                           ElementwiseMulGradFunctor<T>,
+                           ElementwiseMulBroadCastGradFunctor<T>,
+                           ElementwiseMulBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/elementwise_op.h b/paddle/operators/elementwise_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f224722c1bec6716e68de9da2509250f7d4b37ae
--- /dev/null
+++ b/paddle/operators/elementwise_op.h
@@ -0,0 +1,312 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <iostream>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+/*
+ * Out = X ⊙ Y
+ * If Y's shape does not match X' shape, they will be reshaped.
+ * For example:
+ * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
+ *    pre=2, n=3*4, post=5
+ *    x.shape(2, 12, 5) * y.shape(1,12,1).broadcast(2,12,5)
+ * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5)
+ *    pre=2*3, n=4*5, post=1
+ *    x.shape(2, 3, 20) * y.shape(1,1,20).broadcast(2,3,20)
+ */
+inline void get_mid_dims(const framework::DDim& x_dims,
+                         const framework::DDim& y_dims, const int axis,
+                         int& pre, int& n, int& post) {
+  pre = 1;
+  n = 1;
+  post = 1;
+  for (int i = 0; i < axis; ++i) {
+    pre *= x_dims[i];
+  }
+
+  for (int i = 0; i < y_dims.size(); ++i) {
+    PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i],
+                      "Broadcast dimension mismatch.");
+    n *= y_dims[i];
+  }
+
+  for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
+    post *= x_dims[i];
+  }
+}
+
+#define EIGEN_FUNCTOR(name, eigen_op)                                          \
+  struct Eigen##name##Functor {                                                \
+    template <typename Place, typename T>                                      \
+    inline void Run(const framework::Tensor* x, const framework::Tensor* y,    \
+                    framework::Tensor* z,                                      \
+                    const framework::ExecutionContext& ctx) {                  \
+      auto x_e = framework::EigenVector<T>::Flatten(*x);                       \
+      auto y_e = framework::EigenVector<T>::Flatten(*y);                       \
+      auto z_e = framework::EigenVector<T>::Flatten(*z);                       \
+      z_e.device(ctx.GetEigenDevice<Place>()) = eigen_op(x_e, y_e);            \
+    }                                                                          \
+    template <typename Place, typename T>                                      \
+    inline void RunBroadCast(const framework::Tensor* x,                       \
+                             const framework::Tensor* y, framework::Tensor* z, \
+                             const framework::ExecutionContext& ctx, int pre,  \
+                             int n) {                                          \
+      auto x_e = framework::EigenVector<T>::Flatten(*x);                       \
+      auto y_e = framework::EigenVector<T>::Flatten(*y);                       \
+      auto z_e = framework::EigenVector<T>::Flatten(*z);                       \
+      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))                  \
+                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))             \
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));          \
+      z_e.device(ctx.GetEigenDevice<Place>()) = eigen_op(x_e, y_bcast);        \
+    }                                                                          \
+    template <typename Place, typename T>                                      \
+    inline void RunBroadCast2(const framework::Tensor* x,                      \
+                              const framework::Tensor* y,                      \
+                              framework::Tensor* z,                            \
+                              const framework::ExecutionContext& ctx, int pre, \
+                              int n, int post) {                               \
+      auto x_e = framework::EigenVector<T>::Flatten(*x);                       \
+      auto y_e = framework::EigenVector<T>::Flatten(*y);                       \
+      auto z_e = framework::EigenVector<T>::Flatten(*z);                       \
+      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))               \
+                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))       \
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));          \
+      z_e.device(ctx.GetEigenDevice<Place>()) = eigen_op(x_e, y_bcast);        \
+    }                                                                          \
+  }
+
+template <class functor, typename Place, typename T>
+void ElementwiseCompute(const framework::ExecutionContext& ctx) {
+  using Tensor = framework::Tensor;
+
+  auto* x = ctx.Input<Tensor>("X");
+  auto* y = ctx.Input<Tensor>("Y");
+  auto* z = ctx.Output<Tensor>("Out");
+  z->mutable_data<T>(ctx.GetPlace());
+
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+  PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+                    "Rank of first input must >= rank of second input.")
+
+  if (x_dims == y_dims || product(y_dims) == 1) {
+    functor f;
+    f.template Run<Place, T>(x, y, z, ctx);
+    return;
+  }
+
+  int axis = ctx.Attr<int>("axis");
+  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+  PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+                 "Axis should be in range [0, x_dims)");
+
+  int pre, n, post;
+  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+  if (post == 1) {
+    functor f;
+    f.template RunBroadCast<Place, T>(x, y, z, ctx, pre, n);
+    return;
+  } else {
+    functor f;
+    f.template RunBroadCast2<Place, T>(x, y, z, ctx, pre, n, post);
+    return;
+  }
+}
+
+#define EIGEN_ADD(x, y) ((x) + (y))
+EIGEN_FUNCTOR(Add, EIGEN_ADD);
+
+#define EIGEN_SUB(x, y) ((x) - (y))
+EIGEN_FUNCTOR(Sub, EIGEN_SUB);
+
+#define EIGEN_MUL(x, y) ((x) * (y))
+EIGEN_FUNCTOR(Mul, EIGEN_MUL);
+
+#define EIGEN_DIV(x, y) ((x) / (y))
+EIGEN_FUNCTOR(Div, EIGEN_DIV);
+
+template <typename Place, typename T, typename functor, typename functor1,
+          typename broadcastfunctor, typename broadcast2functor>
+void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
+  using Tensor = framework::Tensor;
+
+  auto* x = ctx.Input<Tensor>("X");
+  auto* y = ctx.Input<Tensor>("Y");
+  auto* out = ctx.Input<Tensor>("Out");
+  auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+  auto place = ctx.GetEigenDevice<Place>();
+
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+
+  auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+  auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+  if (dx) {
+    dx->mutable_data<T>(ctx.GetPlace());
+  }
+  if (dy) {
+    dy->mutable_data<T>(ctx.GetPlace());
+  }
+
+  if (x_dims == y_dims) {
+    functor f;
+    f(place, x, y, out, dx, dy, dout);
+    return;
+  }
+
+  if (product(y_dims) == 1) {
+    functor1 f;
+    f(place, x, y, out, dx, dy, dout);
+    return;
+  }
+
+  int axis = ctx.Attr<int>("axis");
+  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+
+  int pre, n, post;
+  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+
+  if (post == 1) {
+    broadcastfunctor f;
+    f(place, x, y, out, dx, dy, dout, pre, n);
+    return;
+  } else {
+    broadcast2functor f;
+    f(place, x, y, out, dx, dy, dout, pre, n, post);
+    return;
+  }
+}
+
+class ElementwiseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  using Tensor = framework::Tensor;
+  void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of elementwise op should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
+                            "Input(Y) of elementwise op should not be null");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Out"),
+        "Output(Out) of elementwise op should not be null.");
+
+    auto x_dim = ctx.Input<Tensor>("X")->dims();
+    auto y_dim = ctx.Input<Tensor>("Y")->dims();
+    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
+                      "Rank of first input must >= rank of second input.")
+    ctx.Output<framework::Tensor>("Out")->Resize(x_dim);
+    ctx.ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ElementwiseOpMaker(framework::OpProto* proto,
+                     framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", R"DOC(
+The first input of elementwise op, it's a tensor of any dimensions.
+)DOC");
+    AddInput("Y", R"DOC(
+The sencond input of elementwise op, it's a tensor and it's dimensions
+must be small or equal to X's dimensions.
+)DOC");
+    AddAttr<int>("axis",
+                 R"DOC(
+When the shape(Y) does not equal the shape(X),Y will be broadcasted 
+to match the shape of X and axis should be dimension index Y in X
+        )DOC")
+        .SetDefault(-1)
+        .EqualGreaterThan(-1);
+
+    AddOutput("Out", "The output of elementwise op");
+    comment_ = R"DOC(
+Limited elementwise {name} operator.The equation is: Out = {equation}.
+1. The shape of Y should be same with X or
+2. Y's shape is a subset of X. 
+   Y will be broadcasted to match the shape of X and axis should be dimension index Y in X.
+
+   example:
+      shape(X) = (2, 3, 4, 5), shape(Y) = (,)
+      shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
+      shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
+      shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
+      shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
+
+Both the input X and Y can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD with input X.
+)DOC";
+    AddComment(comment_);
+  }
+
+ protected:
+  std::string comment_;
+
+  void Replace(std::string& src, std::string from, std::string to) {
+    std::size_t len_from = std::strlen(from.c_str());
+    std::size_t len_to = std::strlen(to.c_str());
+    for (std::size_t pos = src.find(from); pos != std::string::npos;
+         pos = src.find(from, pos + len_to)) {
+      src.replace(pos, len_from, to);
+    }
+  }
+
+  void SetComment(std::string name, std::string equation) {
+    Replace(comment_, "{name}", name);
+    Replace(comment_, "{equation}", equation);
+  }
+};
+
+class ElementwiseOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  using Tensor = framework::Tensor;
+
+ protected:
+  void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
+                            "Input(Out@GRAD) should not be null");
+
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
+    auto y_dims = ctx.Input<Tensor>("Y")->dims();
+    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
+    auto* x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* y_grad = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
+
+    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+                      "Rank of first input must >= rank of second input.")
+
+    if (x_grad) {
+      x_grad->Resize(x_dims);
+    }
+
+    if (y_grad) {
+      y_grad->Resize(y_dims);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_sub_op.cc b/paddle/operators/elementwise_sub_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..31c37ff7ab5595c29f973929387d3945b6f3aaf8
--- /dev/null
+++ b/paddle/operators/elementwise_sub_op.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/elementwise_sub_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwiseSubOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwiseSubOpMaker(framework::OpProto* proto,
+                        framework::OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Sub", "Out = X - Y");
+    AddComment(comment_);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker,
+            elementwise_sub_grad, ops::ElementwiseOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_sub,
+    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_sub_grad,
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/elementwise_sub_op.cu b/paddle/operators/elementwise_sub_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0efb92fce9975ed9fa029a3ce919589d09efb0d7
--- /dev/null
+++ b/paddle/operators/elementwise_sub_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_sub_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(
+    elementwise_sub,
+    ops::ElementwiseSubKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    elementwise_sub_grad,
+    ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6bc66cd0e1594a8bc7070e2f182401b92d1c88e
--- /dev/null
+++ b/paddle/operators/elementwise_sub_op.h
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class ElementwiseSubKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseCompute<EigenSubFunctor, Place, T>(ctx);
+  }
+};
+
+template <typename T>
+struct ElementwiseSubGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0) * dz_e;
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseSubOneGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0) * dz_e.sum();
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseSubBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0) *
+                       dz_e.reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseSubBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0) *
+                       dz_e.reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+
+template <typename Place, typename T>
+class ElementwiseSubGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<Place, T, ElementwiseSubGradFunctor<T>,
+                           ElementwiseSubOneGradFunctor<T>,
+                           ElementwiseSubBroadCastGradFunctor<T>,
+                           ElementwiseSubBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu
index 708344046760691aa2da562eb1ee3d8b130c5f18..62f63b4f3c876e084e2468001e8bcb9310d16a82 100644
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@@ -77,7 +77,10 @@ class LookupTableCUDAKernel : public framework::OpKernel {
 
     dim3 threads(128, 8);
     dim3 grids(8, 1);
-    LookupTable<T, 128, 8, 8><<<grids, threads>>>(output, table, ids, N, K, D);
+    LookupTable<T, 128, 8, 8><<<
+        grids, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                               context.device_context())
+                               .stream()>>>(output, table, ids, N, K, D);
   }
 };
 
@@ -102,8 +105,10 @@ class LookupTableGradCUDAKernel : public framework::OpKernel {
 
     dim3 threads(128, 8);
     dim3 grids(8, 1);
-    LookupTableGrad<T, 128, 8, 8><<<grids, threads>>>(d_table, d_output, ids, N,
-                                                      K, D);
+    LookupTableGrad<T, 128, 8, 8><<<
+        grids, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                               context.device_context())
+                               .stream()>>>(d_table, d_output, ids, N, K, D);
   }
 };
 
diff --git a/paddle/operators/lstm_unit_op.cc b/paddle/operators/lstm_unit_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3600f199770c4b8c9a6561b4c270a91bc8b20c0b
--- /dev/null
+++ b/paddle/operators/lstm_unit_op.cc
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/lstm_unit_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LstmUnitOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of LSTM should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("C_prev"),
+                            "Input(C_prev) of LSTM should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("C"),
+                            "Output(C) of LSTM should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("H"),
+                            "Output(H) of LSTM should not be null.");
+
+    auto *x = ctx.Input<framework::Tensor>("X");
+    auto *c_prev = ctx.Input<framework::Tensor>("C_prev");
+
+    PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2.");
+    PADDLE_ENFORCE(x->dims()[0] == c_prev->dims()[0],
+                   "Batch size of inputs and states must be equal");
+    PADDLE_ENFORCE(x->dims()[1] == c_prev->dims()[1] * 4,
+                   "Dimension of FC should equal to prev state * 4");
+
+    int b_size = c_prev->dims()[0];  // batch size
+    int s_dim = c_prev->dims()[1];   // state dim
+    ctx.Output<framework::LoDTensor>("C")->Resize({b_size, s_dim});
+    ctx.Output<framework::LoDTensor>("H")->Resize({b_size, s_dim});
+  }
+};
+
+template <typename AttrType>
+class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LstmUnitOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "FC input before the non-linear activation.");
+    AddInput(
+        "C_prev",
+        "The cell state tensor of last time-step in the Lstm Unit operator.");
+    AddOutput("C", "The cell tensor of Lstm Unit operator.");
+    AddOutput("H", "The hidden state tensor of Lstm Unit operator.");
+
+    AddComment(R"DOC(Lstm-Unit Operator
+
+Equation: 
+  i, f, o, j = split(X)
+  C = C_prev * sigm(f + forget_bias) + sigm(i) * tanh(j)
+  H = C * sigm(o)
+   
+)DOC");
+    AddAttr<AttrType>("forget_bias", "The forget bias of Lstm Unit.")
+        .SetDefault(0.0);
+  }
+};
+
+class LstmUnitGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("C")),
+                            "Input(C@GRAD) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("H")),
+                            "Input(H@GRAD) should not be null");
+    ctx.Output<framework::LoDTensor>(framework::GradVarName("X"))
+        ->Resize(ctx.Input<Tensor>("X")->dims());
+    ctx.Output<framework::LoDTensor>(framework::GradVarName("C_prev"))
+        ->Resize(ctx.Input<Tensor>("C_prev")->dims());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker<float>,
+            lstm_unit_grad, ops::LstmUnitGradOp);
+REGISTER_OP_CPU_KERNEL(lstm_unit,
+                       ops::LstmUnitKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    lstm_unit_grad, ops::LstmUnitGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6e5e4978994c281416a65af5f8ffdec688768d63
--- /dev/null
+++ b/paddle/operators/lstm_unit_op.cu
@@ -0,0 +1,173 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/cross_entropy_op.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <typename Dtype>
+__device__ Dtype cuda_sigmoid(const Dtype x) {
+  return Dtype(1) / (Dtype(1) + exp(-x));
+}
+
+template <typename Dtype>
+__device__ Dtype cuda_tanh(const Dtype x) {
+  return Dtype(1 - exp(-2. * x)) / (Dtype(1) + exp(-2. * x));
+}
+
+template <typename T>
+__global__ void LSTMUnitKernel(const int nthreads, const int dim,
+                               const T* C_prev, const T* X, T* C, T* H,
+                               const T forget_bias) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    const int n = index / dim;
+    const int d = index % dim;
+
+    const T* X_offset = X + 4 * dim * n;
+    const T i = cuda_sigmoid(X_offset[d]);
+    const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias);
+    const T o = cuda_sigmoid(X_offset[2 * dim + d]);
+    const T g = cuda_tanh(X_offset[3 * dim + d]);
+    const T c_prev = C_prev[index];
+    const T c = f * c_prev + i * g;
+    C[index] = c;
+    const T tanh_c = cuda_tanh(c);
+    H[index] = o * tanh_c;
+  }
+}
+
+template <typename T>
+__global__ void LSTMUnitGradientKernel(const int nthreads, const int dim,
+                                       const T* C_prev, const T* X, const T* C,
+                                       const T* H, const T* C_diff,
+                                       const T* H_diff, T* C_prev_diff,
+                                       T* X_diff, const T forget_bias) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    const int n = index / dim;
+    const int d = index % dim;
+    const T* X_offset = X + 4 * dim * n;
+    T* c_prev_diff = C_prev_diff + index;
+    T* X_diff_offset = X_diff + 4 * dim * n;
+    T* i_diff = X_diff_offset + d;
+    T* f_diff = X_diff_offset + 1 * dim + d;
+    T* o_diff = X_diff_offset + 2 * dim + d;
+    T* g_diff = X_diff_offset + 3 * dim + d;
+
+    const T i = cuda_sigmoid(X_offset[d]);
+    const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias);
+    const T o = cuda_sigmoid(X_offset[2 * dim + d]);
+    const T g = cuda_tanh(X_offset[3 * dim + d]);
+    const T c_prev = C_prev[index];
+    const T c = C[index];
+    const T tanh_c = cuda_tanh(c);
+    const T c_term_diff =
+        C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c);
+    *c_prev_diff = c_term_diff * f;
+    *i_diff = c_term_diff * g * i * (1 - i);
+    *f_diff = c_term_diff * c_prev * f * (1 - f);
+    *o_diff = H_diff[index] * tanh_c * o * (1 - o);
+    *g_diff = c_term_diff * i * (1 - g * g);
+  }
+}
+
+template <typename T, typename AttrType = T>
+class LstmUnitOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+
+    auto* x_tensor = ctx.Input<framework::Tensor>("X");
+    auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev");
+    auto* c_tensor = ctx.Output<framework::Tensor>("C");
+    auto* h_tensor = ctx.Output<framework::Tensor>("H");
+
+    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+
+    int b_size = c_tensor->dims()[0];
+    int D = c_tensor->dims()[1];
+
+    const T* X = x_tensor->data<T>();
+    const T* C_prev = c_prev_tensor->data<T>();
+
+    T* C = c_tensor->mutable_data<T>(ctx.GetPlace());
+    T* H = h_tensor->mutable_data<T>(ctx.GetPlace());
+
+    int block = 512;
+    int n = b_size * D;
+    int grid = (n + block - 1) / block;
+
+    LSTMUnitKernel<T><<<grid, block>>>(n, D, C_prev, X, C, H, forget_bias);
+  }
+};
+
+template <typename T, typename AttrType = T>
+class LstmUnitGradOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+
+    auto x_tensor = ctx.Input<Tensor>("X");
+    auto c_prev_tensor = ctx.Input<Tensor>("C_prev");
+    auto c_tensor = ctx.Input<Tensor>("C");
+    auto h_tensor = ctx.Input<Tensor>("H");
+
+    auto hdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("H"));
+    auto cdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("C"));
+
+    auto xdiff_tensor = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto c_prev_diff_tensor =
+        ctx.Output<Tensor>(framework::GradVarName("C_prev"));
+
+    auto* X = x_tensor->data<T>();
+    auto* C_prev = c_prev_tensor->data<T>();
+    auto* C = c_tensor->data<T>();
+    auto* H = h_tensor->data<T>();
+
+    auto* H_diff = hdiff_tensor->data<T>();
+    auto* C_diff = cdiff_tensor->data<T>();
+
+    auto* C_prev_diff = c_prev_diff_tensor->mutable_data<T>(ctx.GetPlace());
+    auto* X_diff = xdiff_tensor->mutable_data<T>(ctx.GetPlace());
+
+    int N = c_tensor->dims()[0];
+    int D = c_tensor->dims()[1];
+
+    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+
+    int block = 512;
+    int n = N * D;
+    int grid = (n + block - 1) / block;
+
+    LSTMUnitGradientKernel<T><<<grid, block>>>(n, D, C_prev, X, C, H, C_diff,
+                                               H_diff, C_prev_diff, X_diff,
+                                               forget_bias);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel<float>);
diff --git a/paddle/operators/lstm_unit_op.h b/paddle/operators/lstm_unit_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..683034fe15df8cabfdff5e856adb5c0467055064
--- /dev/null
+++ b/paddle/operators/lstm_unit_op.h
@@ -0,0 +1,148 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "glog/logging.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::LoDTensor;
+using framework::Tensor;
+
+template <typename T>
+inline T sigmoid(T x) {
+  return 1. / (1. + exp(-x));
+}
+
+template <typename T>
+inline T tanh(T x) {
+  return 2. * sigmoid(2. * x) - 1.;
+}
+
+template <typename Place, typename T, typename AttrType = T>
+class LstmUnitKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+
+    auto* x_tensor = ctx.Input<framework::Tensor>("X");
+    auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev");
+    auto* c_tensor = ctx.Output<framework::Tensor>("C");
+    auto* h_tensor = ctx.Output<framework::Tensor>("H");
+
+    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+
+    int b_size = c_tensor->dims()[0];
+    int D = c_tensor->dims()[1];
+
+    T* C = c_tensor->mutable_data<T>(ctx.GetPlace());
+    T* H = h_tensor->mutable_data<T>(ctx.GetPlace());
+
+    const T* X = x_tensor->data<T>();
+    const T* C_prev = c_prev_tensor->data<T>();
+
+    for (int n = 0; n < b_size; ++n) {
+      for (int d = 0; d < D; ++d) {
+        const T i = sigmoid(X[d]);
+        const T f = sigmoid(X[1 * D + d] + forget_bias);
+        const T o = sigmoid(X[2 * D + d]);
+        const T g = tanh(X[3 * D + d]);
+        const T c_prev = C_prev[d];
+        const T c = f * c_prev + i * g;
+        C[d] = c;
+        const T tanh_c = tanh(c);
+        H[d] = o * tanh_c;
+      }
+      C_prev += D;
+      X += 4 * D;
+      C += D;
+      H += D;
+    }
+  }
+};
+
+template <typename Place, typename T, typename AttrType = T>
+class LstmUnitGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+
+    auto x_tensor = ctx.Input<Tensor>("X");
+    auto c_prev_tensor = ctx.Input<Tensor>("C_prev");
+    auto c_tensor = ctx.Input<Tensor>("C");
+    auto h_tensor = ctx.Input<Tensor>("H");
+
+    auto hdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("H"));
+    auto cdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("C"));
+
+    auto xdiff_tensor = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto c_prev_diff_tensor =
+        ctx.Output<Tensor>(framework::GradVarName("C_prev"));
+
+    auto* X = x_tensor->data<T>();
+    auto* C_prev = c_prev_tensor->data<T>();
+    auto* C = c_tensor->data<T>();
+    auto* H = h_tensor->data<T>();
+
+    auto* H_diff = hdiff_tensor->data<T>();
+    auto* C_diff = cdiff_tensor->data<T>();
+
+    auto* C_prev_diff = c_prev_diff_tensor->mutable_data<T>(ctx.GetPlace());
+    auto* X_diff = xdiff_tensor->mutable_data<T>(ctx.GetPlace());
+
+    int N = c_tensor->dims()[0];
+    int D = c_tensor->dims()[1];
+
+    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+
+    for (int n = 0; n < N; ++n) {
+      for (int d = 0; d < D; ++d) {
+        T* c_prev_diff = C_prev_diff + d;
+        T* i_diff = X_diff + d;
+        T* f_diff = X_diff + 1 * D + d;
+        T* o_diff = X_diff + 2 * D + d;
+        T* g_diff = X_diff + 3 * D + d;
+
+        const T i = sigmoid(X[d]);
+        const T f = sigmoid(X[1 * D + d] + forget_bias);
+        const T o = sigmoid(X[2 * D + d]);
+        const T g = tanh(X[3 * D + d]);
+        const T c_prev = C_prev[d];
+        const T c = C[d];
+        const T tanh_c = tanh(c);
+        const T c_term_diff = C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c);
+        *c_prev_diff = c_term_diff * f;
+        *i_diff = c_term_diff * g * i * (1 - i);
+        *f_diff = c_term_diff * c_prev * f * (1 - f);
+        *o_diff = H_diff[d] * tanh_c * o * (1 - o);
+        *g_diff = c_term_diff * i * (1 - g * g);
+      }
+      C_prev += D;
+      X += 4 * D;
+      C += D;
+      H += D;
+      C_diff += D;
+      H_diff += D;
+      X_diff += 4 * D;
+      C_prev_diff += D;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
index def4b01da098fc960ce7c0e497732fbcc2579945..ba653afa2cb175ae2e5e21088b6dc7ba76a6018f 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -48,6 +48,32 @@ void gemm<platform::CPUPlace, double>(const platform::DeviceContext& context,
               beta, C, ldc);
 }
 
+template <>
+void gemm<platform::CPUPlace, float>(const platform::DeviceContext& context,
+                                     const bool transA, const bool transB,
+                                     const int M, const int N, const int K,
+                                     const float alpha, const float* A,
+                                     const int lda, const float* B,
+                                     const int ldb, const float beta, float* C,
+                                     const int ldc) {
+  cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
+              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
+              lda, B, ldb, beta, C, ldc);
+}
+
+template <>
+void gemm<platform::CPUPlace, double>(const platform::DeviceContext& context,
+                                      const bool transA, const bool transB,
+                                      const int M, const int N, const int K,
+                                      const double alpha, const double* A,
+                                      const int lda, const double* B,
+                                      const int ldb, const double beta,
+                                      double* C, const int ldc) {
+  cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
+              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
+              lda, B, ldb, beta, C, ldc);
+}
+
 template <>
 void matmul<platform::CPUPlace, float>(
     const platform::DeviceContext& context, const framework::Tensor& matrix_a,
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index 71563b77b4b262c3f1e17ae7c4381da56ba780a3..649f1f352c2a4a5ebaa0cb00ffb2e4de8aa4961a 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -63,6 +63,42 @@ void gemm<platform::GPUPlace, double>(const platform::DeviceContext& context,
       cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
 }
 
+template <>
+void gemm<platform::GPUPlace, float>(const platform::DeviceContext& context,
+                                     const bool transA, const bool transB,
+                                     const int M, const int N, const int K,
+                                     const float alpha, const float* A,
+                                     const int lda, const float* B,
+                                     const int ldb, const float beta, float* C,
+                                     const int ldc) {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
+  PADDLE_ENFORCE(platform::dynload::cublasSgemm(
+      reinterpret_cast<const platform::CUDADeviceContext&>(context)
+          .cublas_handle(),
+      cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc));
+}
+
+template <>
+void gemm<platform::GPUPlace, double>(const platform::DeviceContext& context,
+                                      const bool transA, const bool transB,
+                                      const int M, const int N, const int K,
+                                      const double alpha, const double* A,
+                                      const int lda, const double* B,
+                                      const int ldb, const double beta,
+                                      double* C, const int ldc) {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
+  PADDLE_ENFORCE(platform::dynload::cublasDgemm(
+      reinterpret_cast<const platform::CUDADeviceContext&>(context)
+          .cublas_handle(),
+      cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc));
+}
+
 template <>
 void matmul<platform::GPUPlace, float>(
     const platform::DeviceContext& context, const framework::Tensor& matrix_a,
diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h
index d8518e77fa7b4abdbcf08b7983013c24806e14ca..43306fca73387b7b212f556a2b187df113a1b327 100644
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -70,6 +70,13 @@ void gemm(const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
           const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
           const T alpha, const T* A, const T* B, const T beta, T* C);
 
+// gemm wrapper with stride args for matrix uncontinuous in memory
+template <typename Place, typename T>
+void gemm(const platform::DeviceContext& context, const bool transA,
+          const bool transB, const int M, const int N, const int K,
+          const T alpha, const T* A, const int lda, const T* B, const int ldb,
+          const T beta, T* C, const int ldc);
+
 // matrix multiply with continuous memory
 template <typename Place, typename T>
 void matmul(const platform::DeviceContext& context,
diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc
index 7e339457f7f08ff16162f399064a4b4dca594d7f..f272f7e5135e7092618b8c94ee55faf1cfd8e8a5 100644
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
@@ -72,4 +72,174 @@ TEST(math_function, trans_mul_notrans) {
   EXPECT_EQ(out_ptr[8], 29);
   delete gpu_place;
 }
+
+TEST(math_function, gemm_notrans_cublas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
+
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({3, 4}, *cpu_place);
+  float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+
+  auto* gpu_place = new paddle::platform::GPUPlace(0);
+  paddle::platform::CUDADeviceContext context(*gpu_place);
+
+  input1_gpu.CopyFrom<float>(input1, *gpu_place);
+  input2_gpu.CopyFrom<float>(input2, *gpu_place);
+  input3_gpu.CopyFrom<float>(input3, *gpu_place);
+  float* a = input1_gpu.data<float>();
+  float* b = input2_gpu.data<float>();
+  float* c = input3_gpu.mutable_data<float>(*gpu_place);
+
+  paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
+      context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
+
+  input3.CopyFrom<float>(input3_gpu, *cpu_place);
+
+  // numpy code:
+  // a = np.arange(6).reshape(2, 3)
+  // b = np.arange(12).reshape(3, 4)[:, 1:]
+  // c = np.arange(8).reshape(2, 4)[:, 1:]
+  // out = np.arange(8).reshape(2, 4)
+  // out[:, 1:] = np.dot(a, b) + c
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+  delete gpu_place;
+}
+
+TEST(math_function, gemm_trans_cublas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
+
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({4, 3}, *cpu_place);
+  float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+
+  auto* gpu_place = new paddle::platform::GPUPlace(0);
+  paddle::platform::CUDADeviceContext context(*gpu_place);
+
+  input1_gpu.CopyFrom<float>(input1, *gpu_place);
+  input2_gpu.CopyFrom<float>(input2, *gpu_place);
+  input3_gpu.CopyFrom<float>(input3, *gpu_place);
+  float* a = input1_gpu.data<float>();
+  float* b = input2_gpu.data<float>();
+  float* c = input3_gpu.mutable_data<float>(*gpu_place);
+
+  paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
+      context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
+
+  input3.CopyFrom<float>(input3_gpu, *cpu_place);
+
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+  delete gpu_place;
+}
 #endif
+
+TEST(math_function, gemm_notrans_cblas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({3, 4}, *cpu_place);
+  float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+
+  paddle::platform::CPUDeviceContext context(*cpu_place);
+  paddle::operators::math::gemm<paddle::platform::CPUPlace, float>(
+      context, false, false, m, n, k, 1, input1_ptr, 3, input2_ptr + 1, 4, 1,
+      input3_ptr + 1, 4);
+
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+}
+
+TEST(math_function, gemm_trans_clbas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({4, 3}, *cpu_place);
+  float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+
+  paddle::platform::CPUDeviceContext context(*cpu_place);
+  paddle::operators::math::gemm<paddle::platform::CPUPlace, float>(
+      context, false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1,
+      input3_ptr + 1, 4);
+
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+}
diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6e77b86b5698a263b850a973cd1b8644a0aa2201
--- /dev/null
+++ b/paddle/operators/multiplex_op.cc
@@ -0,0 +1,113 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/multiplex_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+class MultiplexOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(!ctx.MultiInputVar("X").empty(),
+                   "Input(X) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) shouldn't be null.");
+    auto ins = ctx.MultiInput<Tensor>("X");
+    auto *out = ctx.Output<LoDTensor>("Out");
+    auto num_ins = ins.size();
+    PADDLE_ENFORCE(num_ins > 2,
+                   "multiplex operator should have more than 2 inputs.");
+    PADDLE_ENFORCE_EQ(ins[0]->dims().size(), 1,
+                      "The first input must be a index vector.");
+    auto in_dim = ins[1]->dims();
+
+    for (size_t i = 2; i < num_ins; i++) {
+      auto dim = ins[i]->dims();
+      PADDLE_ENFORCE(
+          in_dim == dim,
+          "All the input tensors except the first one must have the same size");
+    }
+    out->Resize(in_dim);
+  }
+};
+
+class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MultiplexOpMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensors of multiplex operator.").AsDuplicable();
+    AddOutput("Out", "The output tensor of multiplex operator.");
+    AddComment(R"DOC(Multiplex operator
+
+Multiplex multiple tensors according to the index provided by the first
+input tensor.
+
+ins[0]: the index tensor.
+ins[1:N]: the candidate output tensors.
+For each index i from 0 to batchSize - 1, the output is the i-th row of the
+the (index[i] + 1)-th tensor.
+
+For i-th row of the output tensor:
+
+y[i][j] = x_{k}[i][j], j = 0,1, ... , (x_{1}.width - 1)
+
+where y is the output tensor. `x_{k}` is the k-th input tensor
+and `k = x{0}[i] + 1`.
+
+)DOC");
+  }
+};
+
+class MultiplexGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(!ctx.MultiInputVar("X").empty(),
+                   "Input(X) should not be null");
+    PADDLE_ENFORCE(!ctx.MultiOutputVar(framework::GradVarName("X")).empty(),
+                   "Output(X@Grad) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
+                            "Input(Out@GRAD) shouldn't be null.");
+    auto d_ins = ctx.MultiOutput<LoDTensor>(framework::GradVarName("X"));
+    auto ins = ctx.MultiInput<Tensor>("X");
+    // don't compute gradient for index (ins[0])
+    for (size_t i = 1; i < ins.size(); i++) {
+      if (d_ins[i]) {
+        d_ins[i]->Resize(ins[i]->dims());
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OP(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, multiplex_grad,
+            ops::MultiplexGradOp);
+REGISTER_OP_CPU_KERNEL(
+    multiplex, ops::MultiplexCPUKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    multiplex_grad,
+    ops::MultiplexGradCPUKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4736f15bd594178168e3bcf799142d0fc18bff13
--- /dev/null
+++ b/paddle/operators/multiplex_op.cu
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/multiplex_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class MultiplexGPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::LoDTensor>("Out");
+
+    out->mutable_data<T>(ctx.GetPlace());
+
+    auto rows = ins[1]->dims()[0];
+    auto cols = ins[1]->dims()[1];
+    // copy index to cpu
+    framework::Tensor index_t_cpu;
+    index_t_cpu.CopyFrom<T>(*(ins[0]), platform::CPUPlace());
+    auto* index = index_t_cpu.data<T>();
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                      ctx.device_context())
+                      .stream();
+    Place place = boost::get<Place>(ctx.GetPlace());
+    for (auto i = 0; i < rows; i++) {
+      int k = (int)index[i] + 1;
+      PADDLE_ENFORCE_LT(k, ins.size(),
+                        "index exceeds the number of candidate tensors.");
+      memory::Copy(place, out->data<T>() + i * cols, place,
+                   ins[k]->data<T>() + i * cols, cols * sizeof(T), stream);
+    }
+  }
+};
+
+template <typename Place, typename T>
+class MultiplexGradGPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto d_ins =
+        ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
+    for (size_t i = 1; i < d_ins.size(); i++) {
+      if (d_ins[i]) {
+        d_ins[i]->mutable_data<T>(ctx.GetPlace());
+        auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
+        t.device(ctx.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+      }
+    }
+
+    auto rows = ins[1]->dims()[0];
+    auto cols = ins[1]->dims()[1];
+    // copy index to cpu
+    framework::Tensor index_t_cpu;
+    index_t_cpu.CopyFrom<T>(*(ins[0]), platform::CPUPlace());
+    auto* index = index_t_cpu.data<T>();
+
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                      ctx.device_context())
+                      .stream();
+    Place place = boost::get<Place>(ctx.GetPlace());
+    for (auto i = 0; i < rows; i++) {
+      int k = (int)index[i] + 1;
+      if (d_ins[k]) {
+        memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
+                     d_out->data<T>() + i * cols, cols * sizeof(T), stream);
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(
+    multiplex, ops::MultiplexGPUKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    multiplex_grad,
+    ops::MultiplexGradGPUKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/multiplex_op.h b/paddle/operators/multiplex_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..98466426bd90bc30a22ecf74e6739e2d4ad1d21d
--- /dev/null
+++ b/paddle/operators/multiplex_op.h
@@ -0,0 +1,78 @@
+
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class MultiplexCPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::LoDTensor>("Out");
+
+    out->mutable_data<T>(ctx.GetPlace());
+
+    auto rows = ins[1]->dims()[0];
+    auto cols = ins[1]->dims()[1];
+    auto* index = ins[0]->data<T>();
+    Place place = boost::get<Place>(ctx.GetPlace());
+    for (auto i = 0; i < rows; i++) {
+      int k = (int)index[i] + 1;
+      PADDLE_ENFORCE_LT(static_cast<size_t>(k), ins.size(),
+                        "index exceeds the number of candidate tensors.");
+      memory::Copy(place, out->data<T>() + i * cols, place,
+                   ins[k]->data<T>() + i * cols, cols * sizeof(T));
+    }
+  }
+};
+
+template <typename Place, typename T>
+class MultiplexGradCPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto d_ins =
+        ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
+    for (size_t i = 1; i < d_ins.size(); i++) {
+      if (d_ins[i]) {
+        d_ins[i]->mutable_data<T>(ctx.GetPlace());
+        auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
+        t.device(ctx.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+      }
+    }
+
+    auto rows = ins[1]->dims()[0];
+    auto cols = ins[1]->dims()[1];
+    auto* index = ins[0]->data<T>();
+    Place place = boost::get<Place>(ctx.GetPlace());
+    for (auto i = 0; i < rows; i++) {
+      int k = (int)index[i] + 1;
+      if (d_ins[k]) {
+        memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
+                     d_out->data<T>() + i * cols, cols * sizeof(T));
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index ad985839f5908d9235a4dbefc9b841362810114e..e7deaf9940699b938e4f36358c2c7f3ba15e918b 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -80,7 +80,6 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
   // Now all variables in scope must be created outside of op.
   PADDLE_ENFORCE_NOT_NULL(stepnet_);
   PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), "stepnet_ op has no outputs");
-  PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), "net_op has no outputs");
 
   if (seq_len_ > step_scopes->size()) {
     for (size_t i = step_scopes->size(); i < seq_len_; ++i) {
@@ -129,8 +128,8 @@ const rnn::ArgumentName RecurrentOp::kArgName{
     "memories", "pre_memories", "boot_memories"};
 
 const rnn::ArgumentName RecurrentGradientOp::kArgName{
-    "step_net", "step_scopes",  "outlink@grad",      "inlink@grad",
-    "memories", "pre_memories", "boot_memories@grad"};
+    "step_net", "step_scopes@GRAD", "outlinks@GRAD",     "inlinks@GRAD",
+    "memories", "pre_memories",     "boot_memories@GRAD"};
 
 RecurrentOp::RecurrentOp(const std::string& type,
                          const framework::VariableNameMap& inputs,
@@ -226,13 +225,13 @@ RecurrentGradientOp::RecurrentGradientOp(
     const framework::VariableNameMap& outputs,
     const framework::AttributeMap& attrs)
     : OperatorBase(type, inputs, outputs, attrs) {
-  rnn::InitArgument(kArgName, &arg_, *this);
+  rnn::InitArgument(kArgName, &arg_, *this, true /*is grad*/);
   alg_.Init(&arg_, &stepnet_);
 }
 
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_WITHOUT_GRADIENT(
-    recurrent, paddle::operators::RecurrentOp,
-    paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker);
+REGISTER_OP(recurrent, paddle::operators::RecurrentOp,
+            paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker,
+            recurrent_grad, paddle::operators::RecurrentGradientOp);
diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h
index 1033d657a3a8f96c8b3dae8dd93d3f1f6840b59b..ad4df9e55b91dbe89c34762945cd9edefde86e08 100644
--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
@@ -22,7 +22,7 @@ namespace paddle {
 namespace operators {
 
 // The sequence format in RecurrentOp is Tensor<seq_len, batch_size, dim> now.
-// TODO(Yan Chunwei):
+// TODO(Superjom)
 // 1. No-padding computing for sequences with indifinite length in one batch.
 // 2. Hierarchical RNN for sequence with sub-sequence.
 // 3. Internal Memory.
@@ -177,6 +177,9 @@ class RecurrentGradientOp : public framework::OperatorBase {
 
   static const rnn::ArgumentName kArgName;
 
+  /*
+   * set a stepnet that is created according to a RecurrentOp's stepnet.
+   */
   void set_stepnet(std::unique_ptr<OperatorBase> net) {
     stepnet_ = std::move(net);
   }
diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc
index ca7219b26d83eb6b8db75a5ed9cd360c5ac1d5df..a767009d2366e20d2ebd35f562b8df7d408f2d4e 100644
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
@@ -109,15 +109,14 @@ void LinkMemories(const std::vector<Scope*>& scopes,
 }
 
 void InitArgument(const ArgumentName& name, Argument* arg,
-                  const framework::OperatorBase& op) {
-  arg->step_scopes = op.Output(name.step_scopes);
-
+                  const framework::OperatorBase& op, bool is_grad) {
+  arg->step_scopes =
+      is_grad ? op.Input(name.step_scopes) : op.Output(name.step_scopes);
   arg->inlinks = op.Inputs(name.inlinks);
-
   arg->outlinks = op.Outputs(name.outlinks);
 
-  auto boot_memories = op.Inputs(name.boot_memories);
-
+  auto boot_memories =
+      is_grad ? op.Outputs(name.boot_memories) : op.Inputs(name.boot_memories);
   // attributes
   auto memories = op.Attr<std::vector<std::string>>(name.memories);
   auto pre_memories = op.Attr<std::vector<std::string>>(name.pre_memories);
diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h
index 7dafe5d0088c4c8bf2cad163654e7e4f28eebe2e..9c777f1e9067a3e2ceb9d23f7bf7d3c73343c91f 100644
--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ b/paddle/operators/rnn/recurrent_op_utils.h
@@ -78,7 +78,7 @@ void LinkMemories(const std::vector<Scope*>& step_scopes,
                   const int offset, bool infer_shape_mode);
 
 void InitArgument(const ArgumentName& name, Argument* arg,
-                  const framework::OperatorBase& op);
+                  const framework::OperatorBase& op, bool is_grad = false);
 
 }  // namespace rnn
 }  // namespace operators
diff --git a/paddle/operators/sequence_avg_pool_op.cc b/paddle/operators/sequence_pool_op.cc
similarity index 53%
rename from paddle/operators/sequence_avg_pool_op.cc
rename to paddle/operators/sequence_pool_op.cc
index 9815b8f3a8d813959949bbfedc79f404721a8216..73f9cb879a2ef690909428b3b672b12717a6a02c 100644
--- a/paddle/operators/sequence_avg_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -12,22 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sequence_avg_pool_op.h"
+#include "paddle/operators/sequence_pool_op.h"
 
 namespace paddle {
 namespace operators {
 
-class SequenceAvgPoolOp : public framework::OperatorWithKernel {
+class SequencePoolOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
   void InferShape(const framework::InferShapeContext& ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.InputVar("X"), "Input(X) of SequenceAvgPoolOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of SequencePoolOp should not be null.");
     PADDLE_ENFORCE_NOT_NULL(
         ctx.OutputVar("Out"),
-        "Output(Out) of SequenceAvgPoolOp should not be null.");
+        "Output(Out) of SequencePoolOp should not be null.");
 
     auto* x = ctx.Input<framework::LoDTensor>("X");
     auto dims = x->dims();
@@ -42,21 +42,45 @@ class SequenceAvgPoolOp : public framework::OperatorWithKernel {
   }
 };
 
-class SequenceAvgPoolOpMaker : public framework::OpProtoAndCheckerMaker {
+class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceAvgPoolOpMaker(framework::OpProto* proto,
-                         framework::OpAttrChecker* op_checker)
+  SequencePoolOpMaker(framework::OpProto* proto,
+                      framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of SequenceAvgPoolOp.");
-    AddOutput("Out", "The output of SequenceAvgPoolOp.");
+    AddInput("X",
+             "A float LoDTensor, the variable-length input of SequencePoolOp");
+    AddOutput(
+        "Out",
+        "A float LoDTensor, the variable-length output of SequencePoolOp.");
+    AddAttr<int>(
+        "strategy",
+        "(int, default AVERAGE) the pooling strategy of SequencePoolOp.")
+        .SetDefault(AVERAGE)
+        .InEnum({AVERAGE, SUM, SQRT, MAX, LAST, FIRST});
     AddComment(R"DOC(
-    SequenceAvgPoolOp averages features of all time-steps of each instance.
-    More detailed comments will be added later.
+    SequencePoolOp pools features of all time-steps of each instance.
+
+    For a mini-batch of 3 variable lengths sentences, containing 2, 3, and 2 time-steps:
+    
+    Assume X is a [7,M,N] float LoDTensor, and X->lod()[0] = [0, 2, 5, 7].
+    Besides, for the sake of simplicity, we assume M=1 and N=1, 
+    and the value of X = [[1, 3], [2, 4, 6], [5, 1]].
+
+    Thus, Out is a [3,1,1] float LoDTensor, but Out->lod() is nullptr.
+    And for different strategy, the value of Out is as follows: 
+
+    - AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
+    - SUM: [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
+    - SQRT: [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2), 
+           6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
+    - MAX: [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
+    - LAST: [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
+    - FIRST: [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
     )DOC");
   }
 };
 
-class SequenceAvgPoolGradOp : public framework::OperatorWithKernel {
+class SequencePoolGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -84,12 +108,10 @@ class SequenceAvgPoolGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sequence_avg_pool, ops::SequenceAvgPoolOp,
-            ops::SequenceAvgPoolOpMaker, sequence_avg_pool_grad,
-            ops::SequenceAvgPoolGradOp);
+REGISTER_OP(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker,
+            sequence_pool_grad, ops::SequencePoolGradOp);
 REGISTER_OP_CPU_KERNEL(
-    sequence_avg_pool,
-    ops::SequenceAvgPoolKernel<paddle::platform::CPUPlace, float>);
+    sequence_pool, ops::SequencePoolKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    sequence_avg_pool_grad,
-    ops::SequenceAvgPoolGradKernel<paddle::platform::CPUPlace, float>);
+    sequence_pool_grad,
+    ops::SequencePoolGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/sequence_pool_op.cu b/paddle/operators/sequence_pool_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..66850772d501f873cf754205c19e9d0c0090370a
--- /dev/null
+++ b/paddle/operators/sequence_pool_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/sequence_pool_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    sequence_pool, ops::SequencePoolKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    sequence_pool_grad,
+    ops::SequencePoolGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sequence_avg_pool_op.h b/paddle/operators/sequence_pool_op.h
similarity index 62%
rename from paddle/operators/sequence_avg_pool_op.h
rename to paddle/operators/sequence_pool_op.h
index ebe0956344eb71d0fb2836f1b4a989ac546d9f78..231614b4c1cb0eb1901b1720e933aed5cbb25f77 100644
--- a/paddle/operators/sequence_avg_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -28,54 +28,85 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
+enum SeqPoolType {
+  AVERAGE = 0,
+  SUM = 1,
+  SQRT = 2,  // square_root_n
+  MAX = 3,
+  LAST = 4,
+  FIRST = 5
+};
+
 template <typename Place, typename T>
-class SequenceAvgPoolKernel : public framework::OpKernel {
+class SequencePoolKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>("X");
     auto* out = context.Output<LoDTensor>("Out");
+    int strategy = context.Attr<int>("strategy");
 
     auto dims = in->dims();
-    auto lod = in->lod();
+    auto lod = in->lod()[0];
     int64_t w = in->numel() / dims[0];
 
     out->mutable_data<T>(context.GetPlace());
     auto place = context.GetEigenDevice<Place>();
-    for (int i = 0; i < static_cast<int>(lod[0].size()) - 1; ++i) {
-      Tensor in_t = in->Slice<T>(static_cast<int>(lod[0][i]),
-                                 static_cast<int>(lod[0][i + 1]));
+    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
+      Tensor in_t =
+          in->Slice<T>(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
       Tensor out_t = out->Slice<T>(i, i + 1);
-      int64_t h = static_cast<int64_t>(lod[0][i + 1] - lod[0][i]);
+      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
       auto in_e = EigenMatrix<T>::From(in_t, framework::make_ddim({h, w}));
       auto out_e = EigenVector<T>::Flatten(out_t);
-      out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
+
+      switch (strategy) {
+        case AVERAGE:
+          out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
+          break;
+        case SUM:
+          out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}}));
+          break;
+        default:
+          PADDLE_THROW("unsupported pooling strategy");
+      }
     }
   }
 };
 
 template <typename Place, typename T>
-class SequenceAvgPoolGradKernel : public framework::OpKernel {
+class SequencePoolGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>("X");
     auto* out_g = context.Input<LoDTensor>(framework::GradVarName("Out"));
     auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
+    int strategy = context.Attr<int>("strategy");
 
     auto dims = in->dims();
-    auto lod = in->lod();
+    auto lod = in->lod()[0];
     int64_t w = in->numel() / dims[0];
 
     in_g->mutable_data<T>(context.GetPlace());
     auto place = context.GetEigenDevice<Place>();
-    for (int i = 0; i < static_cast<int>(lod[0].size()) - 1; ++i) {
-      auto in_g_t = in_g->Slice<T>(static_cast<int>(lod[0][i]),
-                                   static_cast<int>(lod[0][i + 1]));
+    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
+      auto in_g_t = in_g->Slice<T>(static_cast<int>(lod[i]),
+                                   static_cast<int>(lod[i + 1]));
       auto out_g_t = out_g->Slice<T>(i, i + 1);
-      int64_t h = static_cast<int64_t>(lod[0][i + 1] - lod[0][i]);
+      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
       auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
       auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
       Eigen::DSizes<int, 2> bcast(h, 1);
-      in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
+
+      switch (strategy) {
+        case AVERAGE:
+          in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
+          break;
+        case SUM:
+          in_g_e.device(place) = (out_g_e).broadcast(bcast);
+          break;
+        default:
+          PADDLE_THROW("unsupported pooling strategy");
+      }
     }
   }
 };
diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu
index afe4d149c53819c45e20353bc9d16393f3f61e0f..53fe505b77bfac8a33803f082f8e935d3ed403b6 100644
--- a/paddle/operators/top_k_op.cu
+++ b/paddle/operators/top_k_op.cu
@@ -301,14 +301,16 @@ class TopkOpCUDAKernel : public framework::OpKernel {
 
     // NOTE: pass lds and dim same to input width.
     // NOTE: old matrix implementation of stride is different to eigen.
-    // TODO(typhoonzero): launch kernel on specified stream.
     // TODO(typhoonzero): refine this kernel.
     dim3 threads(256, 1);
     dim3 grid(input_height, 1);
 
-    KeMatrixTopK<T, 5, 256><<<grid, threads>>>(
-        output_data, output->dims()[1], indices_data, input_data, input_width,
-        input_width, int(k));
+    KeMatrixTopK<T, 5, 256><<<
+        grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                              ctx.device_context())
+                              .stream()>>>(output_data, output->dims()[1],
+                                           indices_data, input_data,
+                                           input_width, input_width, int(k));
   }
 };
 
diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h
index caa78acd98ea4b35fc69643689cfce23026275e0..895e8d6a63d1fad0ee7a6f5647402435d418b2f1 100644
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "ParameterOptimizer.h"
+#include "ParameterUpdateFunctions.h"
 #include "Regularizer.h"
 
 namespace paddle {
@@ -37,6 +38,15 @@ public:
     real torch_learningRate = optConfig_.learning_method() == "torch_momentum"
                                   ? 1.0 - paraConfig.momentum()
                                   : 1.0;
+#ifdef PADDLE_USE_MKLDNN
+    sgdUpdate(learningRate_ * paraConfig.learning_rate() *
+                  (firstTime_ ? 1.0 : torch_learningRate),
+              paraConfig.momentum(),
+              applyDecay_ ? paraConfig.decay_rate() : 0,
+              vecs[PARAMETER_VALUE].get(),
+              vecs[PARAMETER_GRADIENT].get(),
+              vecs[PARAMETER_MOMENTUM].get());
+#else
     vecs[PARAMETER_VALUE]->sgdUpdate(
         *vecs[PARAMETER_GRADIENT],
         *vecs[PARAMETER_MOMENTUM],
@@ -44,6 +54,7 @@ public:
             (firstTime_ ? 1.0 : torch_learningRate),
         paraConfig.momentum(),
         applyDecay_ ? paraConfig.decay_rate() : 0);
+#endif
   }
   virtual void finishBatch() { firstTime_ = false; }
 };
diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp
index c8af7105c78dcbf9f625a348b7f38efcf278469e..8b3be062b654a52e667626199be8c8bb4a2a96d7 100644
--- a/paddle/parameter/ParameterUpdateFunctions.cpp
+++ b/paddle/parameter/ParameterUpdateFunctions.cpp
@@ -30,6 +30,9 @@ void sgdUpdateCpu(real learningRate,
                   const real* grad,
                   real* momentumVec) {
   decayRate *= learningRate;
+#ifdef PADDLE_USE_MKLDNN
+#pragma omp parallel for
+#endif
   for (size_t i = 0; i < size; ++i) {
     momentumVec[i] = momentum * momentumVec[i] - learningRate * grad[i] -
                      decayRate * value[i];
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index a106592e454e21c46cd2f87f1bbf6694955d6e23..f6a39a8e26c301296aac0af7f4e8b2c6c97ece24 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -34,13 +34,14 @@ class DeviceContext {
 
   template <typename DeviceType>
   DeviceType* get_eigen_device() const;
+
+  virtual void Wait() const {}
 };
 
 class CPUDeviceContext : public DeviceContext {
  public:
   CPUDeviceContext();
   explicit CPUDeviceContext(CPUPlace place);
-  virtual ~CPUDeviceContext() {}
 
   Eigen::DefaultDevice* eigen_device() const;
 
@@ -59,7 +60,7 @@ class CUDADeviceContext : public DeviceContext {
   virtual ~CUDADeviceContext();
 
   /*! \brief  Wait for all operations completion in the stream. */
-  void Wait() const;
+  void Wait() const override;
 
   /*! \brief  Return place in the device context. */
   Place GetPlace() const override;
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index ed2420b8740e583d307f6836a70fe7e1c780e28b..f0c825bd9b0bc41396b8fdb95f0b4337cbe3db02 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -36,7 +36,7 @@ int GetCurrentDeviceId();
 //! Set the GPU device id for next execution.
 void SetDeviceId(int device_id);
 
-//！Get the memory usage of current GPU device.
+//! Get the memory usage of current GPU device.
 void GpuMemoryUsage(size_t &available, size_t &total);
 
 //! Get the maximum allocation size of current GPU device.
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index c7009a604f60cda11434ad33b6c7d7caee1befdd..25e290ffbb94354da3393ca0b769aff512d74a41 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -34,12 +34,7 @@ limitations under the License. */
 namespace py = pybind11;
 
 namespace paddle {
-namespace framework {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using LoD = framework::LoD;
-
+namespace pybind {
 static size_t UniqueIntegerGenerator() {
   static std::atomic<size_t> generator;
   return generator.fetch_add(1);
@@ -56,6 +51,10 @@ bool IsCompileGPU() {
 PYBIND11_PLUGIN(core) {
   py::module m("core", "C++ core of PaddlePaddle");
 
+  // using framework in this function. Since it is inside a function, it will
+  // not cause namespace pollution.
+  using namespace paddle::framework;  // NOLINT
+
   py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
       .def_buffer(
           [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
@@ -107,7 +106,7 @@ PYBIND11_PLUGIN(core) {
 #ifdef PADDLE_ONLY_CPU
             new (&instance) LoDTensor(lod);
 #else
-             paddle::framework::LoD new_lod;
+             LoD new_lod;
              new_lod.reserve(lod.size());
              std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
              new (&instance) LoDTensor(new_lod);
@@ -118,7 +117,7 @@ PYBIND11_PLUGIN(core) {
 #ifdef PADDLE_ONLY_CPU
              self.set_lod(lod);
 #else
-             paddle::framework::LoD new_lod;
+             LoD new_lod;
              new_lod.reserve(lod.size());
              std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
              self.set_lod(new_lod);
@@ -132,7 +131,7 @@ PYBIND11_PLUGIN(core) {
            std::vector<std::vector<size_t>> new_lod;
            new_lod.reserve(lod.size());
            std::transform(lod.begin(), lod.end(), std::back_inserter(new_lod),
-               [](paddle::framework::Vector<size_t> item) ->
+               [](Vector<size_t> item) ->
                    std::vector<size_t> {
                  std::vector<size_t> v;
                  v.reserve(item.size());
@@ -238,7 +237,13 @@ All parameter, weight, gradient are variables in Paddle.
              return Backward(forwardOp, no_grad_vars).release();
            })
       .def("infer_shape", &OperatorBase::InferShape)
-      .def("run", &OperatorBase::Run)
+      .def("run",
+           [](OperatorBase &self,
+              const Scope &scope,
+              const platform::DeviceContext &dev_ctx) {
+             self.Run(scope, dev_ctx);
+             dev_ctx.Wait();
+           })
       .def("type",
            [](const OperatorBase &op) -> std::string { return op.Type(); })
       .def("outputs",
@@ -317,5 +322,5 @@ All parameter, weight, gradient are variables in Paddle.
 
   return m.ptr();
 }
-}  // namespace framework
+}  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
index 95171acf729a513e5c92d1e0cba15cb12b38561a..bcfba84a1aa6e646cf255dc4612dfda42169fc44 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -23,7 +23,7 @@ namespace py = pybind11;
 
 namespace paddle {
 
-namespace framework {
+namespace pybind {
 
 namespace details {
 
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index f01ad4142d4fe7c7f7d7aac60d967ea114b93e56..066837ca959e46dbe3b39c661aa1bab11cbf2734 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -37,6 +37,19 @@ add_test(NAME test_CompareTwoNets
             --config_file_a=trainer/tests/sample_trainer_config_qb_rnn.conf --config_file_b=trainer/tests/sample_trainer_config_rnn.conf
     WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 
+################ test_CompareMKLDNNandCPU ######################
+if(WITH_MKLDNN)
+  add_unittest_without_exec(test_CompareMKLDNNandCPU
+      test_CompareTwoNets.cpp)
+  add_test(NAME test_CompareMKLDNNandCPU
+    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+          ${CMAKE_CURRENT_BINARY_DIR}/test_CompareMKLDNNandCPU
+              --config_file_a=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_a=True
+              --config_file_b=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_b=False
+              --use_gpu=False
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endif()
+
 ############### test_CompareTwoOpts ###################
 add_unittest_without_exec(test_CompareTwoOpts
     test_CompareTwoOpts.cpp)
diff --git a/paddle/trainer/tests/sample_trainer_config_simple_net.conf b/paddle/trainer/tests/sample_trainer_config_simple_net.conf
new file mode 100644
index 0000000000000000000000000000000000000000..77f78161535c49da4ef7fc1563cff58c021aecef
--- /dev/null
+++ b/paddle/trainer/tests/sample_trainer_config_simple_net.conf
@@ -0,0 +1,63 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+################################### Data Configuration ###################################
+TrainData(ProtoData(files = "trainer/tests/mnist.list"))
+################################### Algorithm Configuration ###################################
+settings(batch_size = 1000,
+         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
+################################### Network Configuration ###################################
+data = data_layer(name ="input", size=784)
+
+tmp = img_conv_layer(input=data,
+            num_channels=1,
+            filter_size=3,
+            num_filters=32,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+tmp = img_pool_layer(input=tmp,
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=AvgPooling())
+            
+tmp = img_conv_layer(input=tmp,
+            filter_size=3,
+            num_filters=64,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+tmp = img_pool_layer(input=tmp,
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=MaxPooling())
+            
+tmp = fc_layer(input=tmp, size=64,
+               bias_attr=True,
+               act=ReluActivation())
+
+output = fc_layer(input=tmp, size=10,
+                  bias_attr=True,
+                  act=SoftmaxActivation())
+
+lbl = data_layer(name ="label", size=10)
+
+cost = classification_cost(input=output, label=lbl)
+outputs(cost)
diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/trainer/tests/test_CompareTwoNets.cpp
index 94f65e545d116c802fb4877dc14f07aaaf83a4fb..307645d2c3d21d954371fcedb5f95a2536a0183e 100644
--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/trainer/tests/test_CompareTwoNets.cpp
@@ -26,12 +26,15 @@ DECLARE_int32(gpu_id);
 
 DECLARE_bool(local);
 DECLARE_bool(use_gpu);
+DECLARE_bool(use_mkldnn);
 
 DECLARE_string(config);
 DECLARE_string(nics);
 
 DEFINE_string(config_file_a, "", "config of one network to compare");
 DEFINE_string(config_file_b, "", "config of another network to compare");
+DEFINE_bool(use_mkldnn_a, false, "whether to use mkldnn to run config_file_a");
+DEFINE_bool(use_mkldnn_b, false, "whether to use mkldnn to run config_file_b");
 DEFINE_bool(need_high_accuracy,
             false,
             "whether need to run in double accuracy");
@@ -128,6 +131,12 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
                 matA.getWidth());
   }
 
+  if (FLAGS_use_mkldnn_a || FLAGS_use_mkldnn_b) {
+    // some format of mkldnn parameter is different with cpu
+    // test_MKLDNN will check the parameters
+    return;
+  }
+
   vector<ParameterPtr>& parametersA = comDataA.parameters;
   vector<ParameterPtr>& parametersB = comDataB.parameters;
 
@@ -167,10 +176,12 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
 
 TEST(Trainer, create) {
   ComData dataA;
+  FLAGS_use_mkldnn = FLAGS_use_mkldnn_a;
   calcGradient(dataA, FLAGS_config_file_a);
   LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
 
   ComData dataB;
+  FLAGS_use_mkldnn = FLAGS_use_mkldnn_b;
   calcGradient(dataB, FLAGS_config_file_b);
   LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
 
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 0f57b81966647ca5c6f5cd2e5518d2d34942a549..098a51ab8791290d3e0ffa2c3703c724dd2387b9 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1566,7 +1566,7 @@ class LayerBase(object):
         self.config = g_config.model_config.layers.add()
         assert isinstance(self.config, LayerConfig)
         use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
-        mkldnn_acts = ['relu', 'tanh']
+        mkldnn_acts = ['relu', 'tanh', 'softmax']
         if use_mkldnn and active_type in mkldnn_acts:
             active_type = "mkldnn_" + active_type
         self.config.name = name
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index c97e6c0a36774caaa4fd8f8130220849975451a0..74025d2a7bb68f87afd24bb4b70ec425ba0dcb64 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -921,7 +921,7 @@ def data_layer(name, size, depth=None, height=None, width=None,
 
         data = data_layer(name="input", size=1000)
 
-    :param name: The name of this layer. It is optional.
+    :param name: The name of this layer.
     :type name: basestring
     :param size: Size of this data layer.
     :type size: int
@@ -3668,6 +3668,7 @@ def gru_step_naive_layer(input,
     :param param_attr:
     :param layer_attr:
     :return:
+    :rtype: LayerOutput
     """
     if input.size % 3 != 0:
         raise ValueError("GruStep input size must be divided by 3")
diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/framework/op.py
index 6cca41e43b38b8cccb65ff9b347ef226dddecd4d..9086a5cc3452b178ec37fe6a3e358eaa4c5d606b 100644
--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/framework/op.py
@@ -89,12 +89,16 @@ class OpDescCreationMethod(object):
                     new_attr.f = user_defined_attr
                 elif attr.type == framework_pb2.STRING:
                     new_attr.s = user_defined_attr
+                elif attr.type == framework_pb2.BOOLEAN:
+                    new_attr.b = user_defined_attr
                 elif attr.type == framework_pb2.INTS:
                     new_attr.ints.extend(user_defined_attr)
                 elif attr.type == framework_pb2.FLOATS:
                     new_attr.floats.extend(user_defined_attr)
                 elif attr.type == framework_pb2.STRINGS:
                     new_attr.strings.extend(user_defined_attr)
+                elif attr.type == framework_pb2.BOOLEANS:
+                    new_attr.bools.extend(user_defined_attr)
                 elif attr.type == framework_pb2.INT_PAIRS:
                     for p in user_defined_attr:
                         pair = new_attr.int_pairs.add()
diff --git a/python/paddle/v2/framework/tests/test_cos_sim_op.py b/python/paddle/v2/framework/tests/test_cos_sim_op.py
index d314ce391ea2f10a8bd77c24e84fa3e1eebb6c73..47557ccb41d1e835b5d04d1b94f54dfc7aa2855a 100644
--- a/python/paddle/v2/framework/tests/test_cos_sim_op.py
+++ b/python/paddle/v2/framework/tests/test_cos_sim_op.py
@@ -24,15 +24,15 @@ class TestCosSimOp(OpTest):
         self.check_output()
 
     def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.06)
 
     def test_check_grad_ingore_x(self):
         self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set("X"))
+            ['Y'], 'Out', max_relative_error=0.06, no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
         self.check_grad(
-            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
+            ['X'], 'Out', max_relative_error=0.06, no_grad_set=set('Y'))
 
 
 class TestCosSimOp2(TestCosSimOp):
diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
index 0206ca064be87afe204aa99021979b7ddc3c5d63..1de514dff487158e0823fd628d9b3b50f36fdd9b 100644
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
@@ -4,22 +4,24 @@ from op_test import OpTest
 
 
 class TestCrossEntropyOp1(OpTest):
-    """Test standard cross-entropy, with index representation of labels.
+    """Test cross-entropy with discrete one-hot labels.
     """
 
     def setUp(self):
         self.op_type = "cross_entropy"
         batch_size = 30
         class_num = 10
+
         X = np.random.uniform(0.1, 1.0,
                               [batch_size, class_num]).astype("float32")
         label = np.random.randint(0, class_num, (batch_size, 1), dtype="int32")
         cross_entropy = np.asmatrix(
             [[-np.log(X[i][label[i][0]])] for i in range(X.shape[0])],
             dtype="float32")
+
         self.inputs = {"X": X, "Label": label}
         self.outputs = {"Y": cross_entropy}
-        self.attrs = {'soft_label': 0}
+        self.attrs = {"softLabel": False}
 
     def test_check_output(self):
         self.check_output()
@@ -29,13 +31,14 @@ class TestCrossEntropyOp1(OpTest):
 
 
 class TestCrossEntropyOp2(OpTest):
-    """Test soft-label cross-entropy, with vecterized soft labels.
+    """Test cross-entropy with vectorized soft labels.
     """
 
     def setUp(self):
         self.op_type = "cross_entropy"
-        batch_size = 10
-        class_num = 5
+        batch_size = 5
+        class_num = 37
+
         X = np.random.uniform(0.1, 1.0,
                               [batch_size, class_num]).astype("float32")
         label = np.random.uniform(0.1, 1.0,
@@ -43,46 +46,49 @@ class TestCrossEntropyOp2(OpTest):
         label /= label.sum(axis=1, keepdims=True)
         cross_entropy = (-label * np.log(X)).sum(
             axis=1, keepdims=True).astype("float32")
-        self.inputs = {'X': X, 'Label': label}
-        self.outputs = {'Y': cross_entropy}
-        self.attrs = {'soft_label': 1}
+
+        self.inputs = {"X": X, "Label": label}
+        self.outputs = {"Y": cross_entropy}
+        self.attrs = {"softLabel": True}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y')
+        self.check_grad(["X"], "Y", max_relative_error=0.05)
 
 
 class TestCrossEntropyOp3(OpTest):
-    """Test one-hot cross-entropy, with vecterized one-hot representation of
-    labels.
+    """Test cross-entropy with vectorized one-hot representation of labels.
     """
 
     def setUp(self):
         self.op_type = "cross_entropy"
-        batch_size = 30
-        class_num = 10
+        batch_size = 5
+        class_num = 17
+
         X = np.random.uniform(0.1, 1.0,
                               [batch_size, class_num]).astype("float32")
         label_index = np.random.randint(
             0, class_num, (batch_size), dtype="int32")
         label = np.zeros(X.shape)
         label[np.arange(batch_size), label_index] = 1
+
         cross_entropy = np.asmatrix(
             [[-np.log(X[i][label_index[i]])] for i in range(X.shape[0])],
             dtype="float32")
         cross_entropy2 = (-label * np.log(X)).sum(
             axis=1, keepdims=True).astype("float32")
-        self.inputs = {'X': X, 'Label': label}
-        self.outputs = {'Y': cross_entropy}
-        self.attrs = {'soft_label': 1}
+
+        self.inputs = {"X": X, "Label": label}
+        self.outputs = {"Y": cross_entropy}
+        self.attrs = {"softLabel": True}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y')
+        self.check_grad(["X"], "Y", max_relative_error=0.05)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/v2/framework/tests/test_dropout_op.py b/python/paddle/v2/framework/tests/test_dropout_op.py
index 3638fee1a1c26195791bc1f5a46dd749da0aee95..29fc702791184aaacf335e13bcc6d03082bb49a6 100644
--- a/python/paddle/v2/framework/tests/test_dropout_op.py
+++ b/python/paddle/v2/framework/tests/test_dropout_op.py
@@ -7,7 +7,7 @@ class TestDropoutOp(OpTest):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.0, 'is_training': 1}
+        self.attrs = {'dropout_prob': 0.0, 'is_training': True}
         self.outputs = {'Out': self.inputs['X'], 'Mask': np.ones((32, 64))}
 
     def test_check_output(self):
@@ -21,7 +21,7 @@ class TestDropoutOp2(TestDropoutOp):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 1.0, 'is_training': 1}
+        self.attrs = {'dropout_prob': 1.0, 'is_training': True}
         self.outputs = {'Out': np.zeros((32, 64)), 'Mask': np.zeros((32, 64))}
 
 
@@ -29,7 +29,7 @@ class TestDropoutOp3(TestDropoutOp):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.0, 'is_training': 1}
+        self.attrs = {'dropout_prob': 0.0, 'is_training': True}
         self.outputs = {'Out': self.inputs['X'], 'Mask': np.ones((32, 64, 2))}
 
 
@@ -37,7 +37,7 @@ class TestDropoutOp4(OpTest):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.35, 'is_training': 0}
+        self.attrs = {'dropout_prob': 0.35, 'is_training': False}
         self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']}
 
     def test_check_output(self):
@@ -48,7 +48,7 @@ class TestDropoutOp5(OpTest):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.75, 'is_training': 0}
+        self.attrs = {'dropout_prob': 0.75, 'is_training': False}
         self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']}
 
     def test_check_output(self):
diff --git a/python/paddle/v2/framework/tests/test_elementwise_add_op.py b/python/paddle/v2/framework/tests/test_elementwise_add_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3101a709b8bcf58e8682ab3d0ca5217a7f3572d
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_elementwise_add_op.py
@@ -0,0 +1,96 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestElementwiseOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        }
+        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['Y'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
+
+
+class TestElementwiseAddOp_Vector(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.random((32, )).astype("float32"),
+            'Y': np.random.random((32, )).astype("float32")
+        }
+        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestElementwiseAddOp_broadcast_0(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(2).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(2, 1, 1)
+        }
+
+
+class TestElementwiseAddOp_broadcast_1(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(3).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 1)
+        }
+
+
+class TestElementwiseAddOp_broadcast_2(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(4).astype(np.float32)
+        }
+
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 1, 4)
+        }
+
+
+class TestElementwiseAddOp_broadcast_3(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
+            'Y': np.random.rand(3, 4).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 4, 1)
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_elementwise_div_op.py b/python/paddle/v2/framework/tests/test_elementwise_div_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..41cb2b7767eb8e01e46e770a5da21b609f4eb911
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_elementwise_div_op.py
@@ -0,0 +1,105 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class ElementwiseDivOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        """ Warning
+        CPU gradient check error!
+        'X': np.random.random((32,84)).astype("float32"),
+        'Y': np.random.random((32,84)).astype("float32")
+        """
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        }
+        self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
+
+
+class TestElementwiseDivOp_Vector(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [32]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [32]).astype("float32")
+        }
+        self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestElementwiseDivOp_broadcast_0(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [2]).astype("float32")
+        }
+
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out':
+            np.divide(self.inputs['X'], self.inputs['Y'].reshape(2, 1, 1))
+        }
+
+
+class TestElementwiseDivOp_broadcast_1(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [3]).astype("float32")
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out':
+            np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 1))
+        }
+
+
+class TestElementwiseDivOp_broadcast_2(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [4]).astype("float32")
+        }
+
+        self.outputs = {
+            'Out':
+            np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 4))
+        }
+
+
+class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [3, 4]).astype("float32")
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out':
+            np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 4, 1))
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
index e268cfddb26721a35ddd2d2cc18f526ff7b2f6d9..cee4385a8176f7a441a280e3cd40c39ca51493c5 100644
--- a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
@@ -3,14 +3,9 @@ import numpy as np
 from op_test import OpTest
 
 
-class TestElementwiseMulOp_Matrix(OpTest):
+class ElementwiseMulOp(OpTest):
     def setUp(self):
         self.op_type = "elementwise_mul"
-        """ Warning
-        CPU gradient check error!
-        'X': np.random.random((32,84)).astype("float32"),
-        'Y': np.random.random((32,84)).astype("float32")
-        """
         self.inputs = {
             'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
             'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
@@ -32,7 +27,7 @@ class TestElementwiseMulOp_Matrix(OpTest):
             ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y'))
 
 
-class TestElementwiseMulOp_Vector(OpTest):
+class TestElementwiseMulOp_Vector(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -41,22 +36,8 @@ class TestElementwiseMulOp_Vector(OpTest):
         }
         self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])}
 
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.1)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y'))
-
 
-class TestElementwiseMulOp_broadcast_0(OpTest):
+class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -69,22 +50,8 @@ class TestElementwiseMulOp_broadcast_0(OpTest):
             'Out': self.inputs['X'] * self.inputs['Y'].reshape(2, 1, 1)
         }
 
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.1)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y'))
-
 
-class TestElementwiseMulOp_broadcast_1(OpTest):
+class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -97,22 +64,8 @@ class TestElementwiseMulOp_broadcast_1(OpTest):
             'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 3, 1)
         }
 
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.1)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y'))
-
 
-class TestElementwiseMulOp_broadcast_2(OpTest):
+class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -124,22 +77,8 @@ class TestElementwiseMulOp_broadcast_2(OpTest):
             'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 1, 4)
         }
 
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.1)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y'))
-
 
-class TestElementwiseMulOp_broadcast_3(OpTest):
+class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
diff --git a/python/paddle/v2/framework/tests/test_elementwise_sub_op.py b/python/paddle/v2/framework/tests/test_elementwise_sub_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..be982e8c57b30b91c2834bd5db38ea3c89f573ee
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_elementwise_sub_op.py
@@ -0,0 +1,96 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestElementwiseOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
+
+
+class TestElementwiseSubOp_Vector(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.random((32, )).astype("float32"),
+            'Y': np.random.random((32, )).astype("float32")
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+
+
+class TestElementwiseSubOp_broadcast_0(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(2).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(2, 1, 1)
+        }
+
+
+class TestElementwiseSubOp_broadcast_1(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(3).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 3, 1)
+        }
+
+
+class TestElementwiseSubOp_broadcast_2(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(4).astype(np.float32)
+        }
+
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 1, 4)
+        }
+
+
+class TestElementwiseSubOp_broadcast_3(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
+            'Y': np.random.rand(3, 4).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 3, 4, 1)
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_lstm_unit_op.py b/python/paddle/v2/framework/tests/test_lstm_unit_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ce65bfc31d9fa2d3988759a197e2f497b8161b1
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_lstm_unit_op.py
@@ -0,0 +1,38 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def sigmoid_np(x):
+    return 1. / (1. + np.exp(-x))
+
+
+def tanh_np(x):
+    return 2 * sigmoid_np(2. * x) - 1.
+
+
+class LstmUnitTest(OpTest):
+    def setUp(self):
+        self.op_type = "lstm_unit"
+        x_np = np.random.normal(size=(5, 16)).astype("float32")
+        c_np = np.random.normal(size=(5, 4)).astype("float32")
+        i_np, f_np, o_np, j_np = np.split(x_np, 4, axis=1)
+        forget_bias_np = 0.
+        self.attrs = {'forget_bias': 0.}
+
+        new_c = c_np * sigmoid_np(f_np + forget_bias_np) + sigmoid_np(
+            i_np) * tanh_np(j_np)
+        new_h = tanh_np(new_c) * sigmoid_np(o_np)
+
+        self.inputs = {'X': x_np, 'C_prev': c_np}
+        self.outputs = {'C': new_c, 'H': new_h}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X', 'C_prev'], ['C', 'H'], max_relative_error=0.01)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_multiplex_op.py b/python/paddle/v2/framework/tests/test_multiplex_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2b3881cde24c7fb96c3d7f9411352bc62d55077
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_multiplex_op.py
@@ -0,0 +1,43 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestMultiplexOp(OpTest):
+    def setUp(self):
+        self.op_type = "multiplex"
+        rows = 3
+        index = np.array([3, 1, 0])
+        ins1 = np.random.random((rows, 10)).astype("float32")
+        ins2 = np.random.random((rows, 10)).astype("float32")
+        ins3 = np.random.random((rows, 10)).astype("float32")
+        ins4 = np.random.random((rows, 10)).astype("float32")
+        self.inputs = {
+            'X': [('index', index), ('x1', ins1), ('x2', ins2), ('x3', ins3),
+                  ('x4', ins4)]
+        }
+        # multiplex output
+        output = np.zeros_like(ins1)
+        for i in range(0, rows):
+            k = index[i] + 1
+            output[i] = self.inputs['X'][k][1][i]
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['x1', 'x2', 'x3', 'x4'], 'Out')
+
+    def test_check_grad_ignore_x1(self):
+        self.check_grad(['x2', 'x3', 'x4'], 'Out', no_grad_set=set('x1'))
+
+    def test_check_grad_ignore_x1_x2(self):
+        self.check_grad(['x3', 'x4'], 'Out', no_grad_set=set(['x1', 'x2']))
+
+    def test_check_grad_ignore_x3(self):
+        self.check_grad(['x1', 'x2', 'x4'], 'Out', no_grad_set=set('x3'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_prelu_op.py b/python/paddle/v2/framework/tests/test_prelu_op.py
index 2b6b7db36808a4b68c55328a1eb9ac212c18b678..676fd9f7c555fd5c8544e760345ab954cd137dc5 100644
--- a/python/paddle/v2/framework/tests/test_prelu_op.py
+++ b/python/paddle/v2/framework/tests/test_prelu_op.py
@@ -7,6 +7,14 @@ class PReluTest(OpTest):
     def setUp(self):
         self.op_type = "prelu"
         x_np = np.random.normal(size=(10, 10)).astype("float32")
+
+        for pos, val in np.ndenumerate(x_np):
+            # Since zero point in prelu is not differentiable, avoid randomize
+            # zero.
+            while abs(val) < 1e-3:
+                x_np[pos] = np.random.normal()
+                val = x_np[pos]
+
         x_np_sign = np.sign(x_np)
         x_np = x_np_sign * np.maximum(x_np, .005)
         alpha_np = np.array([.1])
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py
index 79eda70021b76cd06e4c40740b1ca49476f4c503..cc3d4776e26a9dcaf9cf8403e0a1d0fca1d2ebae 100644
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -3,6 +3,7 @@ import paddle.v2.framework.core as core
 import unittest
 import numpy as np
 from paddle.v2.framework.op import Operator, RecurrentOp
+from op_test import get_numeric_gradient
 
 
 def py_sigmoid(x):
@@ -47,7 +48,7 @@ class PySimpleRNN(object):
         else:
             pre_mem = self.h_boot
         xW = np.matmul(x, self.W)
-        hU = np.matmul(mem, self.U)
+        hU = np.matmul(pre_mem, self.U)
 
         sum = xW + hU
         self.mems[step_id] = py_sigmoid(sum)
@@ -68,7 +69,7 @@ def create_tensor(scope, name, shape, np_data):
     return tensor
 
 
-class TestRecurrentOp(unittest.TestCase):
+class RecurrentOpTest(unittest.TestCase):
     '''
     Test RNNOp
 
@@ -158,6 +159,42 @@ class TestRecurrentOp(unittest.TestCase):
         print
         print 'py_output', py_output
         self.assertEqual(pd_output.shape, py_output.shape)
+        self.assertTrue(np.isclose(pd_output, py_output, rtol=0.1).all())
+
+
+class RecurrentGradientOpTest(unittest.TestCase):
+    def create_forward_op(self):
+        self.forward_op = RecurrentOp(
+            # inputs
+            inlinks=["x"],
+            boot_memories=["h_boot"],
+            step_net="stepnet",
+            # outputs
+            outlinks=["h"],
+            step_scopes="step_scopes",
+            # attributes
+            pre_memories=["h@pre"],
+            memories=["h@alias"])
+
+        # create a stepnet for RNN
+        stepnet = core.Net.create()
+        x_fc_op = Operator("mul", X="x@alias", Y="W", Out="Wx")
+        h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
+        sum_op = Operator("add", X="Wx", Y="Uh", Out="sum")
+        sig_op = Operator("sigmoid", X="sum", Y="h@alias")
+
+        for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
+            stepnet.append_op(op)
+        stepnet.complete_add_op(True)
+        self.forward_op.set_stepnet(stepnet)
+
+    def create_gradient_op(self):
+        a = set()
+        backward_op = core.RecurrentOp.backward(self.forward_op, a)
+
+    def test_grad(self):
+        self.create_forward_op()
+        self.create_gradient_op()
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/framework/tests/test_seq_pool.py b/python/paddle/v2/framework/tests/test_seq_pool.py
index cf864936af6361da1f16df3cfb759b468214b970..211086e5f4de32b996f0fa27c2eb52670c2b1e11 100644
--- a/python/paddle/v2/framework/tests/test_seq_pool.py
+++ b/python/paddle/v2/framework/tests/test_seq_pool.py
@@ -3,20 +3,37 @@ import numpy as np
 from op_test import OpTest
 
 
-class TestSeqAvgPool1D(OpTest):
-    def setUp(self):
-        self.op_type = 'sequence_avg_pool'
+class SeqPoolType(OpTest):
+    AVERAGE = 0
+    SUM = 1
+    SQRT = 2
+    MAX = 3
+    LAST = 4
+    FIRST = 5
+
+
+class TestSeqAvgPool(OpTest):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
         # one level, batch size is 4
         x = np.random.uniform(0.1, 1, [11, 23]).astype('float32')
         lod = [[0, 4, 5, 8, 11]]
+        self.inputs = {'X': (x, lod)}
 
         out = np.zeros((4, 23)).astype('float32')
+        self.outputs = {'Out': out}
+
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.AVERAGE}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
         for i in range(4):
             sub_x = x[lod[0][i]:lod[0][i + 1], :]
             out[i] = sub_x.mean(axis=0)
 
-        self.inputs = {'X': (x, lod)}
-        self.outputs = {'Out': out}
+    def setUp(self):
+        self.set_data()
+        self.compute()
 
     def test_check_output(self):
         self.check_output()
@@ -25,26 +42,44 @@ class TestSeqAvgPool1D(OpTest):
         self.check_grad(["X"], "Out")
 
 
-class TestSeqAvgPool2D(OpTest):
-    def setUp(self):
-        self.op_type = 'sequence_avg_pool'
+class TestSeqAvgPool2D(TestSeqAvgPool):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
         # one level, batch size is 4
         x = np.random.uniform(0.1, 1, [13, 3, 17]).astype('float32')
         lod = [[0, 4, 5, 8, 13]]
+        self.inputs = {'X': (x, lod)}
 
         out = np.zeros((4, 3, 17)).astype('float32')
+        self.outputs = {'Out': out}
+
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.AVERAGE}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
         for i in range(4):
             sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
             out[i] = np.reshape(sub_x.mean(axis=0), (3, 17))
 
-        self.inputs = {'X': (x, lod)}
-        self.outputs = {'Out': out}
 
-    def test_check_output(self):
-        self.check_output()
+class TestSeqSumPool(TestSeqAvgPool):
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.SUM}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            out[i] = sub_x.sum(axis=0)
 
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+
+class TestSeqSumPool2D(TestSeqAvgPool2D):
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.SUM}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
+        for i in range(4):
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+            out[i] = np.reshape(sub_x.sum(axis=0), (3, 17))
 
 
 if __name__ == '__main__':