diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index d2aab938d4636b1583062e27b73cb30f5d56b7b0..0bbf92293168d4e3af2c1ed0e82b75e6a8d6c0cd 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -106,22 +106,22 @@ function(merge_static_libs TARGET_NAME)
   endforeach()
   list(REMOVE_DUPLICATES libs_deps)
 
-  if(APPLE) # Use OSX's libtool to merge archives
-    # To produce a library we need at least one source file.
-    # It is created by add_custom_command below and will helps
-    # also help to track dependencies.
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+  # To produce a library we need at least one source file.
+  # It is created by add_custom_command below and will helps
+  # also help to track dependencies.
+  set(target_SRCS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
 
+  if(APPLE) # Use OSX's libtool to merge archives
     # Make the generated dummy source file depended on all static input
     # libs. If input lib changes,the source file is touched
     # which causes the desired effect (relink).
-    add_custom_command(OUTPUT ${dummyfile}
-      COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
+    add_custom_command(OUTPUT ${target_SRCS}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
       DEPENDS ${libs})
 
     # Generate dummy staic lib
-    file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-    add_library(${TARGET_NAME} STATIC ${dummyfile})
+    file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
     target_link_libraries(${TARGET_NAME} ${libs_deps})
 
     foreach(lib ${libs})
@@ -130,11 +130,14 @@ function(merge_static_libs TARGET_NAME)
     endforeach()
     add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
       COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
-      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
+      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
+      )
   else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+    set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)
+
     foreach(lib ${libs})
-      set(objlistfile ${lib}.objlist) # list of objects in the input library
-      set(objdir ${lib}.objdir)
+      set(objlistfile ${target_DIR}/${lib}.objlist) # list of objects in the input library
+      set(objdir ${target_DIR}/${lib}.objdir)
 
       add_custom_command(OUTPUT ${objdir}
         COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
@@ -142,31 +145,32 @@ function(merge_static_libs TARGET_NAME)
 
       add_custom_command(OUTPUT ${objlistfile}
         COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
-        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ../${objlistfile}
+        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ${objlistfile}
         DEPENDS ${lib} ${objdir}
         WORKING_DIRECTORY ${objdir})
 
-      # Empty dummy source file that goes into merged library		
-      set(mergebase ${lib}.mergebase.c)		
-      add_custom_command(OUTPUT ${mergebase}		
-        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}		
-        DEPENDS ${objlistfile})		
-
-      list(APPEND mergebases "${mergebase}")
+      list(APPEND target_OBJS "${objlistfile}")
     endforeach()
 
-    add_library(${TARGET_NAME} STATIC ${mergebases})
+    # Make the generated dummy source file depended on all static input
+    # libs. If input lib changes,the source file is touched
+    # which causes the desired effect (relink).
+    add_custom_command(OUTPUT ${target_SRCS}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
+      DEPENDS ${libs} ${target_OBJS})
+
+    # Generate dummy staic lib
+    file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
     target_link_libraries(${TARGET_NAME} ${libs_deps})
 
     # Get the file name of the generated library
-    set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
+    set(target_LIBNAME "$<TARGET_FILE:${TARGET_NAME}>")
 
-    foreach(lib ${libs})
-      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-        COMMAND ${CMAKE_AR} cr ${outlibfile} *.o
-        COMMAND ${CMAKE_RANLIB} ${outlibfile}
-        WORKING_DIRECTORY ${lib}.objdir)
-    endforeach()
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+        COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
+        COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
+        WORKING_DIRECTORY ${target_DIR})
   endif()
 endfunction(merge_static_libs)
 
@@ -196,7 +200,7 @@ function(cc_library TARGET_NAME)
     add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
 
   else(cc_library_SRCS)
-    if (cc_library_DEPS)
+    if(cc_library_DEPS)
       merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
     else()
       message(FATAL "Please specify source file or library in cc_library.")
diff --git a/cmake/util.cmake b/cmake/util.cmake
index e814cad36f2a8ce95a2dc9fabc35cb39506d4cd7..ac911052eb970c5a3e485e3178dd788b1517ca30 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -25,7 +25,7 @@ function(target_circle_link_libraries TARGET_NAME)
             endif()
         endforeach()
         if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
-            if(IOS AND NOT IOS_ENABLE_BITCODE)
+            if(NOT IOS_ENABLE_BITCODE)
                 list(APPEND LIBS "-undefined dynamic_lookup")
             endif()
         endif()
diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst
index 00192aa69bd487787a8743d5589a365eacbd4ff3..acbf4c87ae5242f6cfc593a7fddc649ee3a70d7c 100644
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -158,17 +158,23 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 
 这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。
 
-7. \*-cp27mu-linux_x86_64.whl is not a supported wheel on this platform.
+7. paddlepaddle\*.whl is not a supported wheel on this platform.
 ------------------------------------------------------------------------
 
-出现这个问题的主要原因是，系统编译wheel包的时候，使用的 :code:`wheel` 包是最新的，
-而系统中的 :code:`pip` 包比较老。具体的解决方法是，更新 :code:`pip` 包并重新编译PaddlePaddle。
+出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。最新的paddlepaddle python安装包支持Linux x86_64和MacOS 10.12操作系统，并安装了python 2.7和pip 9.0.1。
+
 更新 :code:`pip` 包的方法是\:
 
 ..  code-block:: bash
 
     pip install --upgrade pip
 
+如果还不行，可以执行 :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` 获取当前系统支持的python包的后缀，
+并对比是否和正在安装的后缀一致。
+
+如果系统支持的是 :code:`linux_x86_64` 而安装包是 :code:`manylinux1_x86_64` ，需要升级pip版本到最新；
+如果系统支持 :code:`manylinux1_x86_64` 而安装包（本地）是 :code:`linux_x86_64` ，可以重命名这个whl包为 :code:`manylinux1_x86_64` 再安装。
+
 8.  python相关的单元测试都过不了
 --------------------------------
 
@@ -310,7 +316,7 @@ Paddle二进制在运行时捕获了浮点数异常，只要出现浮点数异
 * 模型一直不收敛，发散到了一个数值特别大的地方。
 * 训练数据有问题，导致参数收敛到了一些奇异的情况。或者输入数据尺度过大，有些特征的取值达到数百万，这时进行矩阵乘法运算就可能导致浮点数溢出。
 
-主要的解决办法是减小学习律或者对数据进行归一化处理。
+主要的解决办法是减小学习率或者对数据进行归一化处理。
 
 15. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2
 ------------------------------------------------------------------------
@@ -373,3 +379,15 @@ PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数
 
     parameters = paddle.parameters.create(my_cost)
     parameters.set('emb', load_parameter(emb_param_file, 30000, 256))
+
+18. 集群多节点训练，日志中保存均为网络通信类错误
+------------------------------
+
+集群多节点训练，日志报错为网络通信类错误，比如 :code:`Connection reset by peer` 等。
+此类报错通常是由于某一个节点的错误导致这个节点的训练进程退出，从而引发其他节点无法连接导致，可以参考下面的步骤排查：
+
+* 从 :code:`train.log` ， :code:`server.log` 找到最早报错的地方，查看是否是其他错误引发的报错（比如FPE，内存不足，磁盘空间不足等）。
+
+* 如果发现最早的报错就是网络通信的问题，很有可能是非独占方式执行导致的端口冲突，可以联系OP，看当前MPI集群是否支持resource=full参数提交，如果支持增加此参数提交，并更换job 端口。
+
+* 如果当前MPI集群并不支持任务独占模式，可以联系OP是否可以更换集群或升级当前集群。
\ No newline at end of file
diff --git a/doc/survey/cluster_bootstrapping_tools.md b/doc/survey/cluster_bootstrapping_tools.md
new file mode 100644
index 0000000000000000000000000000000000000000..1cd9962700bb49866f1ed6987abc28b27888a23f
--- /dev/null
+++ b/doc/survey/cluster_bootstrapping_tools.md
@@ -0,0 +1,71 @@
+# Cluster bootstrapping tool survey
+## Abstract
+In order to bring up a cluster from bare metal machine to a fully functional kubernetes cluster for Paddlepaddle to run, we need to utilize some tools. Here we are going to compare [Sextant](https://github.com/k8sp/sextant) and [Tectonic installer](https://github.com/coreos/tectonic-installer)
+
+## Basic assumptions
+Here are some basic assumptions before we move on to  details
+1. You are an administrator of a bare metal machine cluster, which means:
+  * you have full control to each of the machines.
+  * you have full control to the network which machines are connected to.
+2. Machines can be booted from network with PEX or iPXE
+3. You understand the [general procedure to bring up a cluster](#appendix-general-procedure-to-bring-up-a-cluster)
+
+if your cluster is able to mark above items with checkmarks, then keep reading.
+
+## Comparing Sextant and Tectonic installer
+### Sextant
+Sextant is an end2end solution to bring up a bare metal cluster to a fully functional k8s cluster, it integrates DHCP, name service, PEX, cloud-config-service, docker registry services altogether. 
+
+#### Pros
+1. End2End: basically all admin need to do is to config the cluster.yaml and power on the cluster.
+2. Offline cluster configuration: Sextant has 2 phases during working with it, config time and deploy time. when admin is configuring, it requires admin's machine has internet connectivity, which will download some images, etc. But in deploy time, it's completely OK to go offline since all dependencies are ready during config time.
+3. docker registry integrated.
+4. GPU machine took care of.
+
+### Cons
+1. k8s API server is not deployed with high availability in considering by default.
+2. No grouping support.
+3. No API interface, a one-off service.
+
+
+### Tectonic installer
+First of all, Tectonic is not free, it requires coreos.com account as a step of installation, and free user can only create less than 10 nodes.
+
+Tectonic is a suite of software which wraps around k8s and providing more utility regarding dev ops, ie, 
+Tectonic installer as it's named, it installs Tectonic to a bare metal cluster which means it's not totally an equivalent of Sextant. At the "booting a cluster" part, it mostly utilizes [Matchbox](https://github.com/coreos/matchbox), which is a general cluster bootstrapper.
+
+Matchbox's Approach is similar to Sexstant.
+
+### Pros
+1. supports grouping machines.
+2. supports running provisioning service in rtk. (not a big deal though).
+3. supports http/gRPC API interface.
+4. supports multi-template.
+
+### Cons
+1. Not an e2e solution to bring up a cluster, need a lot of extra work and other software.
+2. [Not fully supporting](https://github.com/coreos/matchbox/issues/550) centOS deployment yet.
+
+## Conclusion
+Sextant is a better solution overall for paddle cloud deploying to a bare metal cluster. It would be great if Sextant can also 1) deploy k8s api server with high availability by default; 2) not designed as a one-off service.
+
+
+
+## Appendix: General procedure to bring up a cluster
+It's physically impossible for a cluster admin to manually install OS and applications into cluster nodes one by one, here is what an admin would do in cloud industry:
+1. setup a bootstrap machine with static IP in the cluster, which has following services:
+  * DHCP: assigns ip address for rest of the nodes.
+  * name service: to map node name to a IP
+  * PXE related services: the booting related info will be delivered to newly booted machines as their IP is assigned via DHCP service, PXE service will provide further booting and installing info and image with TFTP and http protocol. 
+  * cluster config service: this is for providing cluster node with OS config via http
+  * optional docker registry: a built-in docker registry makes the whole cluster independent from connecting internet, and speeds up software distribution.
+2. New node powers on, it will
+  * broadcast the request for an IP address
+  * DHCP server assigns the IP address, and deliver the PXE booting related info to the node.
+  * cluster node will request config files with booting info delivered with DHCP via the TFTP service, and in most of the cases, the config file will point to a http service for the booting image.
+  * Since PXE is configured with initrd, it will utilize the cloud config service and do further installations like coreOS or K8s installations.
+  * then restart the node.
+
+For further understanding, following 2 links from Matchbox are some good readings:
+* [Machine lifecycle](https://github.com/coreos/matchbox/blob/master/Documentation/machine-lifecycle.md)
+* [PXE booting](https://github.com/coreos/matchbox/blob/master/Documentation/network-booting.md)
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
index dd9e4f1cbd636e29a6934d1119fc93ebc9d0ecee..b9bbe58951c643f1b1649858880fbd2ba3a2a7b7 100644
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -62,6 +62,7 @@ if(ANDROID)
           LIBRARY DESTINATION lib/${ANDROID_ABI})
   execute_process(
     COMMAND ${GIT_EXECUTABLE} log --pretty=oneline -1
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
     OUTPUT_VARIABLE GIT_COMMITS_LIST
     RESULT_VARIABLE GIT_COMMITS_LIST_RESULT
     ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
@@ -81,8 +82,7 @@ if(ANDROID)
       )"
   )
 else(ANDROID)
-  install(TARGETS paddle_capi_whole
-          ARCHIVE DESTINATION lib)
+  install(TARGETS paddle_capi_whole ARCHIVE DESTINATION lib)
   if(NOT IOS)
     install(TARGETS paddle_capi_shared DESTINATION lib)
   endif()
diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc
index 27132eaa0b3b0666fc042faf052dac2e169ba9e7..fda89252e35c382468877e8cab148e5f91d77ac2 100644
--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
@@ -19,6 +19,19 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+static ProgramDesc* g_program_desc = nullptr;
+
+ProgramDesc& GetProgramDesc() {
+  if (g_program_desc == nullptr) {
+    g_program_desc = new ProgramDesc();
+  }
+  return *g_program_desc;
+}
+
+template <>
+AttrType AttrTypeID<bool>() {
+  return BOOLEAN;
+}
 template <>
 AttrType AttrTypeID<int>() {
   return INT;
@@ -32,6 +45,10 @@ AttrType AttrTypeID<std::string>() {
   return STRING;
 }
 template <>
+AttrType AttrTypeID<std::vector<bool>>() {
+  return BOOLEANS;
+}
+template <>
 AttrType AttrTypeID<std::vector<int>>() {
   return INTS;
 }
@@ -47,40 +64,54 @@ template <>
 AttrType AttrTypeID<std::vector<std::pair<int, int>>>() {
   return INT_PAIRS;
 }
+template <>
+AttrType AttrTypeID<BlockDesc>() {
+  return BLOCK;
+}
 
 Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
   switch (attr_desc.type()) {
-    case paddle::framework::AttrType::INT: {
+    case framework::AttrType::BOOLEAN: {
+      return attr_desc.b();
+    }
+    case framework::AttrType::INT: {
       return attr_desc.i();
     }
-    case paddle::framework::AttrType::FLOAT: {
+    case framework::AttrType::FLOAT: {
       return attr_desc.f();
     }
-    case paddle::framework::AttrType::STRING: {
+    case framework::AttrType::STRING: {
       return attr_desc.s();
     }
-    case paddle::framework::AttrType::INTS: {
+    case framework::AttrType::BOOLEANS: {
+      std::vector<bool> val(attr_desc.bools_size());
+      for (int i = 0; i < attr_desc.bools_size(); ++i) {
+        val[i] = attr_desc.bools(i);
+      }
+      return val;
+    }
+    case framework::AttrType::INTS: {
       std::vector<int> val(attr_desc.ints_size());
       for (int i = 0; i < attr_desc.ints_size(); ++i) {
         val[i] = attr_desc.ints(i);
       }
       return val;
     }
-    case paddle::framework::AttrType::FLOATS: {
+    case framework::AttrType::FLOATS: {
       std::vector<float> val(attr_desc.floats_size());
       for (int i = 0; i < attr_desc.floats_size(); ++i) {
         val[i] = attr_desc.floats(i);
       }
       return val;
     }
-    case paddle::framework::AttrType::STRINGS: {
+    case framework::AttrType::STRINGS: {
       std::vector<std::string> val(attr_desc.strings_size());
       for (int i = 0; i < attr_desc.strings_size(); ++i) {
         val[i] = attr_desc.strings(i);
       }
       return val;
     }
-    case paddle::framework::AttrType::INT_PAIRS: {
+    case framework::AttrType::INT_PAIRS: {
       std::vector<std::pair<int, int>> val(attr_desc.int_pairs_size());
       for (int i = 0; i < attr_desc.int_pairs_size(); ++i) {
         val[i].first = attr_desc.int_pairs(i).first();
@@ -88,6 +119,9 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
       }
       return val;
     }
+    case framework::AttrType::BLOCK: {
+      return GetProgramDesc().mutable_blocks(attr_desc.block_idx());
+    }
   }
   PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !");
   return boost::blank();
diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h
index 2b788a76cafe198abb9aed8ba842e37cc6ff73a6..48b54b5422de8c45e15a1b7040b78373dce8fa3a 100644
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -27,13 +27,16 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
-                       std::vector<float>, std::vector<std::string>,
-                       std::vector<std::pair<int, int>>>
+typedef boost::variant<boost::blank, bool, int, float, std::string,
+                       std::vector<bool>, std::vector<int>, std::vector<float>,
+                       std::vector<std::string>,
+                       std::vector<std::pair<int, int>>, BlockDesc*>
     Attribute;
 
 typedef std::unordered_map<std::string, Attribute> AttributeMap;
 
+ProgramDesc& GetProgramDesc();
+
 template <typename T>
 AttrType AttrTypeID();
 
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index c5d46622156c56acb98fb77e7db5ee7bca8c937a..0ec18de5b8a0e7cebdb91c30d2b45596b02dfa51 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -166,9 +166,8 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
 
         // If part of input gradient of that operator is not calculated, fill
         // zero variables to that input gradient.
-        net->AppendOp(OpRegistry::CreateOp("fill_zeros_like",
-                                           {{"Src", {prefix}}},
-                                           {{"Dst", {grad_input}}}, {}));
+        net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}},
+                                           {{"Y", {grad_input}}}, {}));
       }
       return false;
     });
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index ad8003420dc14538d0dae9a1cb19d6459b154576..6932f5b989a3e21ebc44ec4fec9f5223f2547d7a 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -127,8 +127,8 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker {
  public:
   FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Src", "x");
-    AddOutput("Dst", "out");
+    AddInput("X", "x");
+    AddOutput("Y", "out");
     AddComment("");
   }
 };
@@ -325,10 +325,10 @@ TEST(Backward, op_part_of_output_are_not_need) {
 
   auto &fill_zero = *net->ops_[0];
   ASSERT_EQ("fill_zeros_like", fill_zero.Type());
-  ASSERT_EQ(1UL, fill_zero.Inputs("Src").size());
-  ASSERT_EQ("Z", fill_zero.Input("Src"));
-  ASSERT_EQ(1UL, fill_zero.Outputs("Dst").size());
-  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Dst"));
+  ASSERT_EQ(1UL, fill_zero.Inputs("X").size());
+  ASSERT_EQ("Z", fill_zero.Input("X"));
+  ASSERT_EQ(1UL, fill_zero.Outputs("Y").size());
+  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Y"));
 
   auto &d_many_out = *net->ops_[1];
   ASSERT_EQ("many_output_op_grad", d_many_out.Type());
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index dfcb5fb6210a08f35193b83e3b5f7cee92f618d7..6fcfe6de25737b66a2ea6c1a438636f072a513bb 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -23,6 +23,9 @@ enum AttrType {
   FLOATS = 4;
   STRINGS = 5;
   INT_PAIRS = 6;
+  BOOLEAN = 7;
+  BOOLEANS = 8;
+  BLOCK = 9;
 }
 
 message IntPair {
@@ -44,6 +47,9 @@ message OpDesc {
     repeated float floats = 7;
     repeated string strings = 8;
     repeated IntPair int_pairs = 9;
+    optional bool b = 10;
+    repeated bool bools = 11;
+    optional int32 block_idx = 12;
   };
 
   message Var {
@@ -100,7 +106,7 @@ enum DataType {
 
 message LoDTensorDesc {
   required DataType data_type = 1;
-  repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
   optional int32 lod_level = 3 [ default = 0 ];
 }
 
@@ -108,3 +114,12 @@ message VarDesc {
   required string name = 1;
   optional LoDTensorDesc lod_tensor = 2;
 }
+
+message BlockDesc {
+  required int32 idx = 1;
+  required int32 parent_idx = 2;
+  repeated VarDesc vars = 3;
+  repeated OpDesc ops = 4;
+}
+
+message ProgramDesc { repeated BlockDesc blocks = 1; }
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 908a1f2fd0abe0aa4016c72dbcbc18dcc144232c..3c349637cdbe59b2cf9a1ea28e7715f4181f9293 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -72,20 +72,16 @@ bool operator==(const LoD& a, const LoD& b) {
   return true;
 }
 
-void LoDTensor::SliceLevels(size_t level_begin, size_t level_end) {
+void LoDTensor::ShrinkLevels(size_t level_begin, size_t level_end) {
   auto new_lod = framework::SliceLevels(lod_, level_begin, level_end);
   lod_ = new_lod;
 }
 
-void LoDTensor::SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) {
-  PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
-                 NumLevels());
-  PADDLE_ENFORCE(elem_begin < NumElements(level),
-                 "element begin [%d] out of range [%d]", elem_begin,
-                 NumElements(level));
-  PADDLE_ENFORCE(elem_end < NumElements(level) + 1,
-                 "element end [%d] out of range [%d]", elem_end,
-                 NumElements(level));
+void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
+                              size_t elem_end) {
+  PADDLE_ENFORCE_LT(level, NumLevels());
+  PADDLE_ENFORCE_LT(elem_begin, NumElements(level));
+  PADDLE_ENFORCE_LT(elem_end, NumElements(level) + 1);
 
   auto new_lod = framework::SliceInLevel(lod_, level, elem_begin, elem_end);
   lod_ = new_lod;
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index fac5cd20aa7f9db0792f8102bb442192ab1ad63f..82f58464264c6871b51251e0feae3d5ca076cd2b 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -89,15 +89,15 @@ class LoDTensor : public Tensor {
   }
 
   /*
-   * Slice of levels[level_begin:level_end]
+   * Shrink levels[level_begin:level_end]
    */
-  void SliceLevels(size_t level_begin, size_t level_end);
+  void ShrinkLevels(size_t level_begin, size_t level_end);
 
   /*
-   * Slice of elements of a level, [elem_begin: elem_end]
+   * Shrink elements of a level, [elem_begin: elem_end]
    * @note: low performance in slice lod_.
    */
-  void SliceInLevel(size_t level, size_t elem_begin, size_t elem_end);
+  void ShrinkInLevel(size_t level, size_t elem_begin, size_t elem_end);
 
  private:
   LoD lod_;
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index 7915326b27a22e9280e3f09d9bbfc2a58f46aff7..486b839738ec077545163bc47e6a97ef188c3c2f 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -56,11 +56,11 @@ TEST_F(LoDTensorTester, NumElements) {
   ASSERT_EQ(lod_tensor_.NumElements(2), 8UL);
 }
 
-TEST_F(LoDTensorTester, SliceLevels) {
+TEST_F(LoDTensorTester, ShrinkLevels) {
   // slice 1 level
   for (size_t level = 0; level < 3UL; ++level) {
     LoDTensor new_lod_tensor = lod_tensor_;
-    new_lod_tensor.SliceLevels(level, level + 1);
+    new_lod_tensor.ShrinkLevels(level, level + 1);
     ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
     ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
     ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
@@ -68,7 +68,7 @@ TEST_F(LoDTensorTester, SliceLevels) {
   // slice 2 level
   for (size_t level = 0; level < 2UL; ++level) {
     LoDTensor new_lod_tensor = lod_tensor_;
-    new_lod_tensor.SliceLevels(level, level + 2);
+    new_lod_tensor.ShrinkLevels(level, level + 2);
     ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
     ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
     ASSERT_EQ(new_lod_tensor.NumElements(1),
@@ -77,10 +77,10 @@ TEST_F(LoDTensorTester, SliceLevels) {
   }
 }
 
-TEST_F(LoDTensorTester, SliceInLevel) {
+TEST_F(LoDTensorTester, ShrinkInLevel) {
   size_t level = 0;
   LoDTensor new_lod_tensor = lod_tensor_;
-  new_lod_tensor.SliceInLevel(level, 0, 2);
+  new_lod_tensor.ShrinkInLevel(level, 0, 2);
   EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL);
   EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL);
   EXPECT_EQ(new_lod_tensor.NumElements(1), 4UL);
@@ -89,7 +89,7 @@ TEST_F(LoDTensorTester, SliceInLevel) {
 
   level = 1;
   new_lod_tensor = lod_tensor_;
-  new_lod_tensor.SliceInLevel(level, 0, 2);
+  new_lod_tensor.ShrinkInLevel(level, 0, 2);
   ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
   ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
   ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 49509af6630ada5c2ec724525ec0a6eab02679f9..fcbfc3e4377edd0ea55c8d4328c325fa18663001 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -60,8 +60,8 @@ std::string OperatorBase::Output(const std::string& name) const {
 const std::vector<std::string>& OperatorBase::Outputs(
     const std::string& name) const {
   auto it = outputs_.find(name);
-  PADDLE_ENFORCE(it != outputs_.end(), "Op %s does not have output %s", type_,
-                 name);
+  PADDLE_ENFORCE(it != outputs_.end(), "Op %s does not have output called %s",
+                 type_, name);
   return it->second;
 }
 
@@ -207,23 +207,22 @@ const std::vector<const Tensor*> InferShapeContext::MultiInput<Tensor>(
 }
 
 template <>
-Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
-  auto* var = OutputVar(name);
-  return var == nullptr ? nullptr : const_cast<Tensor*>(GetTensorFromVar(var));
+Tensor* InferShapeContext::Output<Tensor>(const std::string& name) const {
+  auto var = OutputVar(name);
+  return var == nullptr ? nullptr : var->GetMutable<LoDTensor>();
 }
 
 template <>
-std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
+std::vector<Tensor*> InferShapeContext::MultiOutput<Tensor>(
     const std::string& name) const {
   auto names = op().Outputs(name);
   std::vector<Tensor*> res;
   res.reserve(names.size());
   std::transform(names.begin(), names.end(), std::back_inserter(res),
                  [&](const std::string& sub_name) {
-                   auto var = scope().FindVar(sub_name);
-                   return var == nullptr
-                              ? nullptr
-                              : const_cast<Tensor*>(GetTensorFromVar(var));
+                   auto var = scope_.FindVar(sub_name);
+                   return var == nullptr ? nullptr
+                                         : var->GetMutable<LoDTensor>();
                  });
   return res;
 }
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 1a78b6d1e146d2d157e353c5729d8518ee264517..2d6d5510ef6dc83f1a016be6ff123f0b9bcaf230 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -212,9 +212,9 @@ class InferShapeContext {
     return res;
   }
 
-  std::vector<const Variable*> MultiOutputVar(const std::string& name) const {
+  std::vector<Variable*> MultiOutputVar(const std::string& name) const {
     auto names = op_.Outputs(name);
-    std::vector<const Variable*> res;
+    std::vector<Variable*> res;
     res.reserve(names.size());
     std::transform(names.begin(), names.end(), std::back_inserter(res),
                    [this](const std::string& name) {
@@ -271,6 +271,20 @@ class InferShapeContext {
     return &var->Get<Tensor>();
   }
 
+  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
+                size_t j = 0) const {
+    PADDLE_ENFORCE_LT(i, InputSize(in));
+    PADDLE_ENFORCE_LT(j, OutputSize(out));
+    auto* in_var = MultiInputVar(in)[i];
+    auto* out_var = MultiOutputVar(out)[j];
+    if (!in_var->IsType<LoDTensor>()) return;
+    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
+                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
+    auto in_tensor = in_var->Get<LoDTensor>();
+    auto* out_tensor = out_var->GetMutable<LoDTensor>();
+    out_tensor->set_lod(in_tensor.lod());
+  }
+
  private:
   const OperatorBase& op_;
   const Scope& scope_;
@@ -283,6 +297,13 @@ template <>
 const std::vector<const Tensor*> InferShapeContext::MultiInput<Tensor>(
     const std::string& name) const;
 
+template <>
+Tensor* InferShapeContext::Output<Tensor>(const std::string& name) const;
+
+template <>
+std::vector<Tensor*> InferShapeContext::MultiOutput<Tensor>(
+    const std::string& name) const;
+
 template <typename T>
 struct EigenDeviceConverter;
 
@@ -315,38 +336,10 @@ class ExecutionContext : public InferShapeContext {
     return device_context_;
   }
 
-  // redefine Output function,
-  // use Variable::Get instead of Variable::GetMutable
-  template <typename T>
-  T* Output(const std::string& name) const {
-    auto var = OutputVar(name);
-    return var == nullptr ? nullptr : const_cast<T*>(&var->Get<T>());
-  }
-
-  // redefine MultiOutput function.
-  // use Variable::Get instead of Variable::GetMutable
-  template <typename T>
-  std::vector<T*> MultiOutput(const std::string& name) const {
-    auto names = op().Outputs(name);
-    std::vector<T*> res;
-    res.reserve(names.size());
-    std::transform(
-        names.begin(), names.end(), std::back_inserter(res),
-        [&](const std::string& sub_name) { return Output<T>(sub_name); });
-    return res;
-  }
-
  private:
   const platform::DeviceContext& device_context_;
 };
 
-template <>
-Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
-
-template <>
-std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
-    const std::string& name) const;
-
 class OpKernel {
  public:
   /**
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 4b5a2ae523f2f7fde5445f0534cd99969ad9d59e..f040c09c089ec75c9773d752685be5e232e8f4b7 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -29,16 +29,19 @@ limitations under the License. */
 
 namespace paddle {
 
-namespace framework {
+namespace pybind {
 namespace details {
 template <bool less, size_t i, typename... args>
 struct CastToPyBufferImpl;
 }
+}  // namespace pybind
+
+namespace framework {
 
 class Tensor {
  public:
   template <bool less, size_t i, typename... args>
-  friend struct details::CastToPyBufferImpl;
+  friend struct pybind::details::CastToPyBufferImpl;
 
   template <typename T, size_t D, int MajorType, typename IndexType>
   friend struct EigenTensor;
@@ -165,12 +168,6 @@ class Tensor {
   /*! points to dimensions of memory block. */
   DDim dims_;
 
-  /**
-   * A cache of the number of elements in a tensor.
-   * Would be 0 for an uninitialized tensor.
-   */
-  int64_t numel_;
-
   /**
    * @brief   A PlaceHolder may be shared by more than one tensor.
    *
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index 6d2c14f4c47afb755b1c74f6dc4dd10ab25ed191..a5405f9c31543b5733f9db923c2a6f8b968cfc2d 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -147,13 +147,12 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
 
 inline Tensor& Tensor::Resize(const DDim& dims) {
   dims_ = dims;
-  numel_ = product(dims_);
   return *this;
 }
 
 inline const DDim& Tensor::dims() const { return dims_; }
 
-inline int64_t Tensor::numel() const { return numel_; }
+inline int64_t Tensor::numel() const { return product(dims_); }
 
 template <typename T>
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc
index 0c813748b2989a8f0c00a359345747242dd21dd8..70e4f9da1221ab300e2b507a3da2f7c5da93f2e4 100644
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -39,7 +39,8 @@ class AccuracyOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(inference->dims()[0], label->dims()[0],
                       "inference size must be the same as label size");
 
-    ctx.Output<framework::LoDTensor>("Accuracy")->Resize({1});
+    ctx.Output<framework::Tensor>("Accuracy")->Resize({1});
+    ctx.ShareLoD("Inference", /*->*/ "Accuracy");
   }
 };
 
@@ -54,11 +55,15 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
     // TODO(typhoonzero): AddInput("Weight", ...
     AddOutput("Accuracy", "The accuracy of current batch");
 
-    AddComment(
-        R"DOC(Accuracy. It will print accuracy rate for classification.
+    AddComment(R"DOC(
+Accuracy. It will print accuracy rate for classification.
 The accuracy is:
 ..  math::
-accuracy = \\frac{NumOfCorrectPredicts}{NumOfAllSamples})DOC");
+accuracy = \\frac{NumOfCorrectPredicts}{NumOfAllSamples})
+
+Both the input `Inference` and `Label` can carry the LoD (Level of Details)
+information, or not. But the output only shares the LoD with input `Inference`.
+)DOC");
   }
 };
 
diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc
index cc55767cef9552475321bcb8c06d74a8d91dc99b..06654702bc42cc7cf4917b00693334b1d36ce371 100644
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -23,8 +23,9 @@ class ActivationOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    ctx.Output<framework::LoDTensor>("Y")->Resize(
+    ctx.Output<framework::Tensor>("Y")->Resize(
         ctx.Input<framework::Tensor>("X")->dims());
+    ctx.ShareLoD("X", /*->*/ "Y");
   }
 };
 
@@ -34,7 +35,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    ctx.Output<framework::LoDTensor>(framework::GradVarName("X"))
+    ctx.Output<framework::Tensor>(framework::GradVarName("X"))
         ->Resize(ctx.Input<framework::Tensor>("Y")->dims());
   }
 };
diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc
index e83c1efeaf897889d18a37a6bd2ca2f8f012db25..ed11d096974341022637676537793645f46738f0 100644
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -33,7 +33,7 @@ class AddOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
                       ctx.Input<Tensor>("Y")->dims(),
                       "Two input of Add Op's dimension must be same.");
-    ctx.Output<framework::LoDTensor>("Out")->Resize(
+    ctx.Output<framework::Tensor>("Out")->Resize(
         ctx.Input<Tensor>("X")->dims());
   }
 };
diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc
index 86d79866a8e7c4cda036ce7e0f5527fd0086b482..e5a54bc4b226fd24337050fdd84b2de9c49f7949 100644
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -17,8 +17,6 @@
 namespace paddle {
 namespace operators {
 
-using framework::LoDTensor;
-
 class ClipOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -29,11 +27,12 @@ class ClipOp : public framework::OperatorWithKernel {
                             "Input(X) of ClipOp should not be null.");
     PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
                             "Output(Out) of ClipOp should not be null.");
-    auto x_dims = ctx.Input<LoDTensor>("X")->dims();
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
     auto max = Attr<float>("max");
     auto min = Attr<float>("min");
     PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
-    ctx.Output<LoDTensor>("Out")->Resize(x_dims);
+    ctx.Output<Tensor>("Out")->Resize(x_dims);
+    ctx.ShareLoD("X", /*->*/ "Out");
   }
 };
 
@@ -66,8 +65,8 @@ class ClipOpGrad : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
                             "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx.Input<LoDTensor>("X")->dims();
-    auto *x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
+    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
     if (x_grad != nullptr) {
       x_grad->Resize(x_dims);
     }
diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc
index 223bb0ffe6e75ce71919eb5f4cca06bedbb00764..07f847079e834716904dcc038d2097efd268bd3e 100644
--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
@@ -29,7 +29,7 @@ class ConcatOp : public framework::OperatorWithKernel {
                             "Output(Out) of ConcatOp should not be null.");
 
     auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto *out = ctx.Output<framework::LoDTensor>("Out");
+    auto *out = ctx.Output<framework::Tensor>("Out");
     size_t axis = static_cast<size_t>(ctx.Attr<int>("axis"));
     size_t n = ins.size();
 
diff --git a/paddle/operators/conv2d_op.cc b/paddle/operators/conv2d_op.cc
index 12db65b5cbf224e95d91c7b4839afa552c084ee7..c3281db0964de6d7dd6be629fbcc55cabb9fef9d 100644
--- a/paddle/operators/conv2d_op.cc
+++ b/paddle/operators/conv2d_op.cc
@@ -37,7 +37,7 @@ class Conv2DOp : public framework::OperatorWithKernel {
 
     auto in = ctx.Input<Tensor>("Input");
     auto filter = ctx.Input<Tensor>("Filter");
-    auto out = ctx.Output<framework::LoDTensor>("Output");
+    auto out = ctx.Output<framework::Tensor>("Output");
     std::vector<int> strides = Attr<std::vector<int>>("strides");
     std::vector<int> paddings = Attr<std::vector<int>>("paddings");
     int groups = Attr<int>("groups");
@@ -111,10 +111,9 @@ class Conv2DOpGrad : public framework::OperatorWithKernel {
   void InferShape(const framework::InferShapeContext &ctx) const override {
     auto in = ctx.Input<Tensor>("Input");
     auto filter = ctx.Input<Tensor>("Filter");
-    auto d_in =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Input"));
+    auto d_in = ctx.Output<framework::Tensor>(framework::GradVarName("Input"));
     auto d_filter =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Filter"));
+        ctx.Output<framework::Tensor>(framework::GradVarName("Filter"));
     if (d_in) d_in->Resize(in->dims());
     if (d_filter) d_filter->Resize(filter->dims());
   }
diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc
index 72c446493684246959656dc048e7f0e761665423..b56ee2047b811e212b4bf74bf7fbba753a6bcb11 100644
--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
@@ -54,9 +54,10 @@ class CosSimOp : public framework::OperatorWithKernel {
                    " just 1 (which will be broadcasted to match Input(X)).");
 
     // resize tensor
-    ctx.Output<framework::LoDTensor>("Out")->Resize({x_dims[0], 1});
-    ctx.Output<framework::LoDTensor>("XNorm")->Resize({x_dims[0], 1});
-    ctx.Output<framework::LoDTensor>("YNorm")->Resize({y_dims[0], 1});
+    ctx.Output<framework::Tensor>("Out")->Resize({x_dims[0], 1});
+    ctx.Output<framework::Tensor>("XNorm")->Resize({x_dims[0], 1});
+    ctx.Output<framework::Tensor>("YNorm")->Resize({y_dims[0], 1});
+    ctx.ShareLoD("X", /*->*/ "Out");
   }
 };
 
@@ -81,10 +82,13 @@ Cosine Similarity Operator.
 
 The equation is: Out = X^T * Y / (sqrt(X^T * X) * sqrt(Y^T * Y)).
 
-Input(X) and Input(Y) must have the same shape, except that the 1st dimension
-of Input(Y) could be just 1 (different from Input(X)), which will be
-broadcasted to match the shape of Input(X) before computing their cosine
+The input `X` and `Y` must have the same shape, except that the 1st dimension
+of input `Y` could be just 1 (different from input `X`), which will be
+broadcasted to match the shape of input `X` before computing their cosine
 similarity.
+
+Both the input `X` and `Y` can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD with input `X`.
 )DOC");
   }
 };
@@ -139,10 +143,8 @@ class CosSimOpGrad : public framework::OperatorWithKernel {
                       "Shape of Input(Out@Grad) must be [X.Dim(0), 1].");
 
     // resize tensor
-    auto *x_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto *y_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
+    auto *x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto *y_grad = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
     if (x_grad) x_grad->Resize(x_dims);
     if (y_grad) y_grad->Resize(y_dims);
   }
diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc
index 7ed21f336f69e494f3c4039c609c83407a80cd8c..52a1123348b10e39bcfa1ba062c893e5f20ed862 100644
--- a/paddle/operators/crop_op.cc
+++ b/paddle/operators/crop_op.cc
@@ -19,7 +19,6 @@ namespace paddle {
 namespace operators {
 
 using framework::Tensor;
-using framework::LoDTensor;
 
 class CropOp : public framework::OperatorWithKernel {
  public:
@@ -31,9 +30,9 @@ class CropOp : public framework::OperatorWithKernel {
                             "Input(X) of CropOp should not be null.");
     PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
                             "Output(Out) of CropOp should not be null.");
-    auto x_dim = ctx.Input<LoDTensor>("X")->dims();
-    auto *y = ctx.Input<LoDTensor>("Y");
-    auto *out = ctx.Output<LoDTensor>("Out");
+    auto x_dim = ctx.Input<Tensor>("X")->dims();
+    auto *y = ctx.Input<Tensor>("Y");
+    auto *out = ctx.Output<Tensor>("Out");
     if (y == nullptr) {
       auto shape = Attr<std::vector<int>>("shape");
       PADDLE_ENFORCE_EQ(
@@ -121,8 +120,8 @@ class CropOpGrad : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
                             "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx.Input<LoDTensor>("X")->dims();
-    auto *x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
+    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
     if (x_grad != nullptr) {
       x_grad->Resize(x_dims);
     }
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index 953367eb8bcd1282ab6c7e1189d778f0ce3da541..b11dc1472d153dd188a0b3553d6950774216a3fd 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -17,8 +17,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using framework::LoDTensor;
-
 class CrossEntropyOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -35,23 +33,21 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2.");
     PADDLE_ENFORCE_EQ(label->dims().size(), 2,
                       "Input(Label)'s rank must be 2.");
-    // TODO(xinghai-sun): remove this check after swtiching to bool
-    PADDLE_ENFORCE(ctx.Attr<int>("soft_label") == 0 ||
-                   ctx.Attr<int>("soft_label") == 1);
     PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
                       "The 1st dimension of Input(X) and Input(Label) must "
                       "be equal.");
-    if (ctx.Attr<int>("soft_label") == 1) {
+    if (ctx.Attr<bool>("soft_label")) {
       PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
-                        "If Attr(soft_label) == 1, The 2nd dimension of "
+                        "If Attr(soft_label) == true, The 2nd dimension of "
                         "Input(X) and Input(Label) must be equal.");
     } else {
       PADDLE_ENFORCE_EQ(label->dims()[1], 1,
-                        "If Attr(soft_label) == 0, The 2nd dimension of "
+                        "If Attr(soft_label) == false, The 2nd dimension of "
                         "Input(Label) must be 1.");
     }
 
-    ctx.Output<LoDTensor>("Y")->Resize({x->dims()[0], 1});
+    ctx.Output<Tensor>("Y")->Resize({x->dims()[0], 1});
+    ctx.ShareLoD("X", /*->*/ "Y");
   }
 };
 
@@ -74,9 +70,6 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(dy->dims().size(), 2, "Input(Y@Grad)'s rank must be 2.");
     PADDLE_ENFORCE_EQ(label->dims().size(), 2,
                       "Input(Label)'s rank must be 2.");
-    // TODO(xinghai-sun): remove this check after swtiching to bool
-    PADDLE_ENFORCE(ctx.Attr<int>("soft_label") == 0 ||
-                   ctx.Attr<int>("soft_label") == 1);
     PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
                       "The 1st dimension of Input(X) and Input(Label) must "
                       "be equal.");
@@ -85,17 +78,17 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
                       "be equal.");
     PADDLE_ENFORCE_EQ(dy->dims()[1], 1,
                       "The 2nd dimension of Input(Y@Grad) must be 1.");
-    if (ctx.Attr<int>("soft_label") == 1) {
+    if (ctx.Attr<bool>("soft_label")) {
       PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
-                        "If Attr(soft_label) == 1, The 2nd dimension of "
+                        "If Attr(soft_label) == true, The 2nd dimension of "
                         "Input(X) and Input(Label) must be equal.");
     } else {
       PADDLE_ENFORCE_EQ(label->dims()[1], 1,
-                        "If Attr(soft_label) == 0, The 2nd dimension of "
+                        "If Attr(soft_label) == false, The 2nd dimension of "
                         "Input(Label) must be 1.");
     }
 
-    auto dx = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     dx->Resize(x->dims());
   }
 };
@@ -108,7 +101,8 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "The first input of CrossEntropyOp");
     AddInput("Label", "The second input of CrossEntropyOp");
     AddOutput("Y", "The output of CrossEntropyOp");
-    AddAttr<int>("soft_label", "Is soft label. Default zero.").SetDefault(0);
+    AddAttr<bool>("soft_label", "Is soft label. Default zero.")
+        .SetDefault(false);
 
     AddComment(R"DOC(
 CrossEntropy Operator.
@@ -116,12 +110,12 @@ CrossEntropy Operator.
 It supports both standard cross-entropy and soft-label cross-entropy loss
 computation.
 1) One-hot cross-entropy:
-    soft_label = 0, Label[i, 0] indicates the class index for sample i:
+    soft_label = False, Label[i, 0] indicates the class index for sample i:
 
                 Y[i] = -log(X[i, Label[i]])
 
 2) Soft-label cross-entropy:
-    soft_label = 1, Label[i, j] indicates the soft label of class j
+    soft_label = True, Label[i, j] indicates the soft label of class j
     for sample i:
 
                 Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
@@ -133,6 +127,9 @@ computation.
      As a special case of 2), when each row of Input(Label) has only one
      non-zero element (equals 1), soft-label cross-entropy degenerates to a
      one-hot cross-entropy with one-hot label representation.
+
+Both the input `X` and `Label` can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD with input `X`.
 )DOC");
   }
 };
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu
index ab6ad0e062269483948bf70e492c9431991221fb..1d6361a81472a49729958120c52060b1dff803f2 100644
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -102,7 +102,7 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel {
     int grid = (n + block - 1) / block;
     // TODO(qingqing) launch kernel on specified stream
     // base on ExecutionContext.
-    if (ctx.Attr<int>("soft_label") == 1) {
+    if (ctx.Attr<bool>("soft_label")) {
       auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
       SoftCrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n,
                                                  d);
@@ -137,7 +137,7 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
     grid = (n + block - 1) / block;
     // TODO(qingqing): launch kernel on specified stream
     // base on ExecutionContext.
-    if (ctx.Attr<int>("soft_label") == 1) {
+    if (ctx.Attr<bool>("soft_label")) {
       auto* label_data = label->data<T>();
       SoftCrossEntropyGradientKernel<T><<<grid, block>>>(
           dx_data, dy_data, x_data, label_data, n, d);
diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h
index 1b4b23ac2029138afadef0168262203ac2e20430..69caba5ff31f60df2c24cef0e6331f058f6ba8d6 100644
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -51,7 +51,7 @@ class CrossEntropyOpKernel : public framework::OpKernel {
     int batch_size = x->dims()[0];
     int class_num = x->dims()[1];
 
-    if (ctx.Attr<int>("soft_label") == 1) {
+    if (ctx.Attr<bool>("soft_label")) {
       auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
       int index = 0;
       for (int i = 0; i < batch_size; ++i) {
@@ -92,7 +92,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel {
     int class_num = x->dims()[1];
 
     // TODO(qingqing): make zero setting an common function.
-    if (ctx.Attr<int>("soft_label") == 1) {
+    if (ctx.Attr<bool>("soft_label")) {
       auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
       int index = 0;
       for (int i = 0; i < batch_size; ++i) {
diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc
index b111b9fccb2310bd5fb92bda878a497c51f62ce0..2130eda6a42c893d8ec251a7022a0bfa44433bb7 100644
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -18,7 +18,6 @@ namespace paddle {
 namespace operators {
 
 using framework::Tensor;
-using framework::LoDTensor;
 
 class DropoutOp : public framework::OperatorWithKernel {
  public:
@@ -29,15 +28,13 @@ class DropoutOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE_GE(ctx.Attr<float>("dropout_prob"), 0);
     PADDLE_ENFORCE_LE(ctx.Attr<float>("dropout_prob"), 1);
-    // TODO(xinghai-sun): remove this check after swtiching to bool
-    PADDLE_ENFORCE(ctx.Attr<int>("is_training") == 0 ||
-                   ctx.Attr<int>("is_training") == 1);
 
     auto dims = ctx.Input<Tensor>("X")->dims();
-    ctx.Output<LoDTensor>("Out")->Resize(dims);
-    if (ctx.Attr<int>("is_training") == 1) {
-      ctx.Output<LoDTensor>("Mask")->Resize(dims);
+    ctx.Output<Tensor>("Out")->Resize(dims);
+    if (ctx.Attr<bool>("is_training")) {
+      ctx.Output<Tensor>("Mask")->Resize(dims);
     }
+    ctx.ShareLoD("X", /*->*/ "Out");
   }
 };
 
@@ -49,8 +46,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddAttr<AttrType>("dropout_prob", "Probability of setting units to zero.")
         .SetDefault(.5f);
-    // TODO(xinghai-sun): use bool for is_training after bool is supported.
-    AddAttr<int>("is_training", "Whether in training phase.").SetDefault(1);
+    AddAttr<bool>("is_training", "Whether in training phase.").SetDefault(true);
     AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
     AddInput("X", "The input of dropout op.");
     AddOutput("Out", "The output of dropout op.");
@@ -59,7 +55,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Dropout Operator.
 
-"Dropout" refers to randomly dropping out units in a nerual network. It is a
+'Dropout' refers to randomly dropping out units in a nerual network. It is a
 regularization technique for reducing overfitting by preventing neuron
 co-adaption during training. The dropout operator randomly set (according to
 the given dropout probability) the outputs of some units to zero, while others
@@ -75,8 +71,8 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx.Attr<int>("is_training"), 1,
-                      "GradOp is only callable when is_training is true");
+    PADDLE_ENFORCE(ctx.Attr<bool>("is_training"),
+                   "GradOp is only callable when is_training is true");
 
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Mask"), "Mask must not be null.");
@@ -85,9 +81,6 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
 
     PADDLE_ENFORCE_GE(ctx.Attr<AttrType>("dropout_prob"), 0);
     PADDLE_ENFORCE_LE(ctx.Attr<AttrType>("dropout_prob"), 1);
-    // TODO(xinghai-sun): remove this check after swtiching to bool
-    PADDLE_ENFORCE(ctx.Attr<int>("is_training") == 0 ||
-                   ctx.Attr<int>("is_training") == 1);
     auto x_dims = ctx.Input<Tensor>("X")->dims();
     auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
     PADDLE_ENFORCE_EQ(x_dims, out_dims,
@@ -96,7 +89,7 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(x_dims, mask_dims,
                       "Dimensions of Input(X) and Mask must be the same.");
 
-    auto *x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
     x_grad->Resize(x_dims);
   }
 };
diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu
index 186237fb238add37f32403309a0f7e8a9846d335..a04e4a22cc09d4e8106a528e490ccf8e90681c08 100644
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -59,7 +59,7 @@ class GPUDropoutKernel : public framework::OpKernel {
     auto Y = EigenMatrix<T>::Reshape(*y, 1);
 
     auto place = context.GetEigenDevice<Place>();
-    if (context.Attr<int>("is_training") == 1) {
+    if (context.Attr<bool>("is_training")) {
       auto* mask = context.Output<Tensor>("Mask");
       auto* mask_data = mask->mutable_data<T>(context.GetPlace());
       int size = framework::product(mask->dims());
diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h
index 82eafee0e0e7db7b4b4ae5405f37146d061aefd5..d57f64afcb3558aeea6aed23fae06866e9af874a 100644
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@@ -35,7 +35,7 @@ class CPUDropoutKernel : public framework::OpKernel {
     auto* y_data = y->mutable_data<T>(context.GetPlace());
     AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
 
-    if (context.Attr<int>("is_training") == 1) {
+    if (context.Attr<bool>("is_training")) {
       auto* mask = context.Output<Tensor>("Mask");
       auto* mask_data = mask->mutable_data<T>(context.GetPlace());
       int seed = context.Attr<int>("seed");
@@ -65,8 +65,8 @@ template <typename Place, typename T>
 class DropoutGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(context.Attr<int>("is_training"), 1,
-                      "GradOp is only callable when is_training is true");
+    PADDLE_ENFORCE(context.Attr<bool>("is_training"),
+                   "GradOp is only callable when is_training is true");
 
     auto* grad_x = context.Output<Tensor>(framework::GradVarName("X"));
     auto* grad_y = context.Input<Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/operators/elementwise_add_op.cc b/paddle/operators/elementwise_add_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5f7b654d69f081dfa85b0d61960eb52b7982faa1
--- /dev/null
+++ b/paddle/operators/elementwise_add_op.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/elementwise_add_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwiseAddOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwiseAddOpMaker(framework::OpProto* proto,
+                        framework::OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("add", "Out = X + Y");
+    AddComment(comment_);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker,
+            elementwise_add_grad, ops::ElementwiseOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_add,
+    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_add_grad,
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/sequence_avg_pool_op.cu b/paddle/operators/elementwise_add_op.cu
similarity index 74%
rename from paddle/operators/sequence_avg_pool_op.cu
rename to paddle/operators/elementwise_add_op.cu
index bc9d1611fccd17c99b914b6ef59995288a9ebbd6..85d063a76b5592c716a5bdf23a0993976abc6ae4 100644
--- a/paddle/operators/sequence_avg_pool_op.cu
+++ b/paddle/operators/elementwise_add_op.cu
@@ -13,13 +13,13 @@
    limitations under the License. */
 
 #define EIGEN_USE_GPU
-
-#include "paddle/operators/sequence_avg_pool_op.h"
+#include "paddle/operators/elementwise_add_op.h"
 
 namespace ops = paddle::operators;
+
 REGISTER_OP_GPU_KERNEL(
-    sequence_avg_pool,
-    ops::SequenceAvgPoolKernel<paddle::platform::GPUPlace, float>);
+    elementwise_add,
+    ops::ElementwiseAddKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    sequence_avg_pool_grad,
-    ops::SequenceAvgPoolGradKernel<paddle::platform::GPUPlace, float>);
+    elementwise_add_grad,
+    ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e9f1ffba6fb23f5394713c67aa4363b85717f50
--- /dev/null
+++ b/paddle/operators/elementwise_add_op.h
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class ElementwiseAddKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseCompute<EigenAddFunctor, Place, T>(ctx);
+  }
+};
+
+template <typename T>
+struct ElementwiseAddGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = dz_e;
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseAddOneGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = dz_e.sum();
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseAddBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = dz_e.reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseAddBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = dz_e.reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+
+template <typename Place, typename T>
+class ElementwiseAddGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<Place, T, ElementwiseAddGradFunctor<T>,
+                           ElementwiseAddOneGradFunctor<T>,
+                           ElementwiseAddBroadCastGradFunctor<T>,
+                           ElementwiseAddBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_div_op.cc b/paddle/operators/elementwise_div_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c6898150d310d0c4fdefae5a58a5792a72f9889e
--- /dev/null
+++ b/paddle/operators/elementwise_div_op.cc
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/elementwise_div_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwiseDivOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwiseDivOpMaker(framework::OpProto* proto,
+                        framework::OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Div", "Out = X / Y");
+    AddComment(comment_);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker,
+            elementwise_div_grad, ops::ElementwiseOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_div,
+    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_div_grad,
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/elementwise_div_op.cu b/paddle/operators/elementwise_div_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b96aa31748c77f0d07f9bb7fb19235239983abd5
--- /dev/null
+++ b/paddle/operators/elementwise_div_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_div_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(
+    elementwise_div,
+    ops::ElementwiseDivKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    elementwise_div_grad,
+    ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bd7c8ea548c46ec9b4c5a085e4e70d5dd162f3a
--- /dev/null
+++ b/paddle/operators/elementwise_div_op.h
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class ElementwiseDivKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseCompute<EigenDivFunctor, Place, T>(ctx);
+  }
+};
+
+template <typename T>
+struct ElementwiseDivGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto z_e = framework::EigenVector<T>::Flatten(*z);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e / y_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = -1.0 * dz_e * z_e / y_e;
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseDivBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
+                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e / y_e_bcast;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast))
+                           .reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseDivBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
+                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e / y_e_bcast;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast))
+                           .reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+
+template <typename Place, typename T>
+class ElementwiseDivGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<Place, T, ElementwiseDivGradFunctor<T>,
+                           ElementwiseDivGradFunctor<T>,
+                           ElementwiseDivBroadCastGradFunctor<T>,
+                           ElementwiseDivBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc
index ee6e975b443691bf71cec904565ced20406f3fba..f2544b54d6bc543a50d8de03d482333b485bc076 100644
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
@@ -17,101 +17,25 @@
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-
-class ElementWiseMulOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of ElementWiseMulOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
-                            "Input(Y) of ElementWiseMulOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.OutputVar("Out"),
-        "Output(Out) of ElementWiseMulOp should not be null.");
-
-    auto x_dim = ctx.Input<Tensor>("X")->dims();
-    auto y_dim = ctx.Input<Tensor>("Y")->dims();
-    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
-                      "Rank of first input must >= rank of second input.")
-    ctx.Output<framework::LoDTensor>("Out")->Resize(x_dim);
-  }
-};
-
-class ElementWiseMulOpMaker : public framework::OpProtoAndCheckerMaker {
+class ElementwiseMulOpMaker : public ElementwiseOpMaker {
  public:
-  ElementWiseMulOpMaker(framework::OpProto *proto,
-                        framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The first input of elementwise mul op");
-    AddInput("Y", "The second input of elementwise mul op");
-    AddAttr<int>("axis",
-                 R"DOC(
-When shape(Y) does not equal shape(X),Y will be broadcasted 
-to match the shape of X and axis should be dimension index Y in X
-        )DOC")
-        .SetDefault(-1)
-        .EqualGreaterThan(-1);
-
-    AddOutput("Out", "The output of elementwise mul op");
-    AddComment(R"DOC(
-Limited elementwise multiple operator.The equation is: Out = X ⊙ Y.
-1. The shape of Y should be same with X or
-2. Y's shape is a subset of X. 
-   Y will be broadcasted to match the shape of X and axis should be dimension index Y in X.
-   example:
-      shape(X) = (2, 3, 4, 5), shape(Y) = (,)
-      shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
-      shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
-      shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
-      shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
-)DOC");
+  ElementwiseMulOpMaker(framework::OpProto* proto,
+                        framework::OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Mul", "Out = X ⊙ Y");
+    AddComment(comment_);
   }
 };
 
-class ElementWiseMulOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) should not be null");
-
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto y_dims = ctx.Input<Tensor>("Y")->dims();
-    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
-    auto *x_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto *y_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
-
-    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-                      "Rank of first input must >= rank of second input.")
-
-    if (x_grad) {
-      x_grad->Resize(x_dims);
-    }
-
-    if (y_grad) {
-      y_grad->Resize(y_dims);
-    }
-  }
-};
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_mul, ops::ElementWiseMulOp, ops::ElementWiseMulOpMaker,
-            elementwise_mul_grad, ops::ElementWiseMulOpGrad);
+REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
+            elementwise_mul_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul,
-    ops::ElementWiseMulKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul_grad,
-    ops::ElementWiseMulGradKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/elementwise_mul_op.cu b/paddle/operators/elementwise_mul_op.cu
index 56f2087c22c6c599a3c5aef36eb0fe3eac295bef..da08a75596c4d3b89dc8892bd4405464fec96389 100644
--- a/paddle/operators/elementwise_mul_op.cu
+++ b/paddle/operators/elementwise_mul_op.cu
@@ -19,7 +19,7 @@ namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(
     elementwise_mul,
-    ops::ElementWiseMulKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
     elementwise_mul_grad,
-    ops::ElementWiseMulGradKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h
index 6d58da580b81b9e0a8ae170eec1a73638b190df8..1eaf2e3efc97a32739efcaf37066817ee173fadc 100644
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
@@ -13,171 +13,104 @@
    limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/operators/elementwise_op.h"
 
 namespace paddle {
 namespace operators {
-/*
- * Out = X ⊙ Y
- * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
- *    pre=2, n=3*4, post=5
- * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5)
- *    pre=2*3, n=4*5, post=1
- */
-
-inline void get_mid_dims(const framework::DDim& x_dims,
-                         const framework::DDim& y_dims, const int axis,
-                         int& pre, int& n, int& post) {
-  pre = 1;
-  n = 1;
-  post = 1;
-  for (int i = 0; i < axis; ++i) {
-    pre *= x_dims[i];
-  }
-
-  for (int i = 0; i < y_dims.size(); ++i) {
-    PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i],
-                      "Broadcast dimension mismatch.");
-    n *= y_dims[i];
-  }
-
-  for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
-    post *= x_dims[i];
-  }
-}
 
 template <typename Place, typename T>
-class ElementWiseMulKernel : public framework::OpKernel {
+class ElementwiseMulKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* z = ctx.Output<Tensor>("Out");
-    z->mutable_data<T>(ctx.GetPlace());
+    ElementwiseCompute<EigenMulFunctor, Place, T>(ctx);
+  }
+};
 
+template <typename T>
+struct ElementwiseMulGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
     auto x_e = framework::EigenVector<T>::Flatten(*x);
     auto y_e = framework::EigenVector<T>::Flatten(*y);
-    auto z_e = framework::EigenVector<T>::Flatten(*z);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
 
-    auto x_dims = x->dims();
-    auto y_dims = y->dims();
-    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-                      "Rank of first input must >= rank of second input.")
-
-    if (x_dims == y_dims || product(y_dims) == 1) {
-      z_e.device(ctx.GetEigenDevice<Place>()) = x_e * y_e;
-      return;
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e * y_e;
     }
 
-    int axis = ctx.Attr<int>("axis");
-    axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-    PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
-                   "Axis should be in range [0, x_dims)");
-
-    int pre, n, post;
-    get_mid_dims(x_dims, y_dims, axis, pre, n, post);
-    if (post == 1) {
-      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
-                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
-                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-      z_e.device(ctx.GetEigenDevice<Place>()) = x_e * y_bcast;
-      return;
-    } else {
-      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
-                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
-                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-      z_e.device(ctx.GetEigenDevice<Place>()) = x_e * y_bcast;
-      return;
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = x_e * dz_e;
     }
   }
 };
 
-template <typename Place, typename T>
-class ElementWiseMulGradKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
+template <typename T>
+struct ElementwiseMulBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
     auto x_e = framework::EigenVector<T>::Flatten(*x);
     auto y_e = framework::EigenVector<T>::Flatten(*y);
-    auto dout_e = framework::EigenVector<T>::Flatten(*dout);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
 
-    auto x_dims = x->dims();
-    auto y_dims = y->dims();
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
+                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
 
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     if (dx) {
-      dx->mutable_data<T>(ctx.GetPlace());
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e * y_e_bcast;
     }
+
     if (dy) {
-      dy->mutable_data<T>(ctx.GetPlace());
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (x_e * dz_e)
+                           .reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
     }
+  }
+};
 
-    if (x_dims == y_dims || product(y_dims) == 1) {
-      if (dx) {
-        auto dx_e = framework::EigenVector<T>::Flatten(*dx);
-        dx_e.device(ctx.GetEigenDevice<Place>()) = dout_e * y_e;
-      }
-
-      if (dy) {
-        auto dy_e = framework::EigenVector<T>::Flatten(*dy);
-        dy_e.device(ctx.GetEigenDevice<Place>()) = x_e * dout_e;
-      }
-      return;
+template <typename T>
+struct ElementwiseMulBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
+                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e * y_e_bcast;
     }
 
-    int axis = ctx.Attr<int>("axis");
-    axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-
-    int pre, n, post;
-    get_mid_dims(x_dims, y_dims, axis, pre, n, post);
-
-    // TODO(gongweibao): wrap reshape to a function.
-    if (post == 1) {
-      auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
-                           .broadcast(Eigen::DSizes<int, 2>(pre, 1))
-                           .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-      if (dx) {
-        auto dx_e = framework::EigenVector<T>::Flatten(*dx);
-        dx_e.device(ctx.GetEigenDevice<Place>()) = dout_e * y_e_bcast;
-      }
-
-      if (dy) {
-        auto dy_e = framework::EigenVector<T>::Flatten(*dy);
-        dy_e.device(ctx.GetEigenDevice<Place>()) =
-            (x_e * dout_e)
-                .reshape(Eigen::DSizes<int, 2>(pre, n))
-                .sum(Eigen::array<int, 1>{{0}});
-      }
-      return;
-    } else {
-      auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
-                           .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
-                           .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-      if (dx) {
-        auto dx_e = framework::EigenVector<T>::Flatten(*dx);
-        dx_e.device(ctx.GetEigenDevice<Place>()) = dout_e * y_e_bcast;
-      }
-
-      if (dy) {
-        auto dy_e = framework::EigenVector<T>::Flatten(*dy);
-        dy_e.device(ctx.GetEigenDevice<Place>()) =
-            (x_e * dout_e)
-                .reshape(Eigen::DSizes<int, 3>(pre, n, post))
-                .sum(Eigen::array<int, 2>{{0, 2}});
-      }
-      return;
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (x_e * dz_e)
+                           .reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
     }
   }
 };
 
+template <typename Place, typename T>
+class ElementwiseMulGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<Place, T, ElementwiseMulGradFunctor<T>,
+                           ElementwiseMulGradFunctor<T>,
+                           ElementwiseMulBroadCastGradFunctor<T>,
+                           ElementwiseMulBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/elementwise_op.h b/paddle/operators/elementwise_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f224722c1bec6716e68de9da2509250f7d4b37ae
--- /dev/null
+++ b/paddle/operators/elementwise_op.h
@@ -0,0 +1,312 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <iostream>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+/*
+ * Out = X ⊙ Y
+ * If Y's shape does not match X' shape, they will be reshaped.
+ * For example:
+ * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
+ *    pre=2, n=3*4, post=5
+ *    x.shape(2, 12, 5) * y.shape(1,12,1).broadcast(2,12,5)
+ * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5)
+ *    pre=2*3, n=4*5, post=1
+ *    x.shape(2, 3, 20) * y.shape(1,1,20).broadcast(2,3,20)
+ */
+inline void get_mid_dims(const framework::DDim& x_dims,
+                         const framework::DDim& y_dims, const int axis,
+                         int& pre, int& n, int& post) {
+  pre = 1;
+  n = 1;
+  post = 1;
+  for (int i = 0; i < axis; ++i) {
+    pre *= x_dims[i];
+  }
+
+  for (int i = 0; i < y_dims.size(); ++i) {
+    PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i],
+                      "Broadcast dimension mismatch.");
+    n *= y_dims[i];
+  }
+
+  for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
+    post *= x_dims[i];
+  }
+}
+
+#define EIGEN_FUNCTOR(name, eigen_op)                                          \
+  struct Eigen##name##Functor {                                                \
+    template <typename Place, typename T>                                      \
+    inline void Run(const framework::Tensor* x, const framework::Tensor* y,    \
+                    framework::Tensor* z,                                      \
+                    const framework::ExecutionContext& ctx) {                  \
+      auto x_e = framework::EigenVector<T>::Flatten(*x);                       \
+      auto y_e = framework::EigenVector<T>::Flatten(*y);                       \
+      auto z_e = framework::EigenVector<T>::Flatten(*z);                       \
+      z_e.device(ctx.GetEigenDevice<Place>()) = eigen_op(x_e, y_e);            \
+    }                                                                          \
+    template <typename Place, typename T>                                      \
+    inline void RunBroadCast(const framework::Tensor* x,                       \
+                             const framework::Tensor* y, framework::Tensor* z, \
+                             const framework::ExecutionContext& ctx, int pre,  \
+                             int n) {                                          \
+      auto x_e = framework::EigenVector<T>::Flatten(*x);                       \
+      auto y_e = framework::EigenVector<T>::Flatten(*y);                       \
+      auto z_e = framework::EigenVector<T>::Flatten(*z);                       \
+      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))                  \
+                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))             \
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));          \
+      z_e.device(ctx.GetEigenDevice<Place>()) = eigen_op(x_e, y_bcast);        \
+    }                                                                          \
+    template <typename Place, typename T>                                      \
+    inline void RunBroadCast2(const framework::Tensor* x,                      \
+                              const framework::Tensor* y,                      \
+                              framework::Tensor* z,                            \
+                              const framework::ExecutionContext& ctx, int pre, \
+                              int n, int post) {                               \
+      auto x_e = framework::EigenVector<T>::Flatten(*x);                       \
+      auto y_e = framework::EigenVector<T>::Flatten(*y);                       \
+      auto z_e = framework::EigenVector<T>::Flatten(*z);                       \
+      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))               \
+                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))       \
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));          \
+      z_e.device(ctx.GetEigenDevice<Place>()) = eigen_op(x_e, y_bcast);        \
+    }                                                                          \
+  }
+
+template <class functor, typename Place, typename T>
+void ElementwiseCompute(const framework::ExecutionContext& ctx) {
+  using Tensor = framework::Tensor;
+
+  auto* x = ctx.Input<Tensor>("X");
+  auto* y = ctx.Input<Tensor>("Y");
+  auto* z = ctx.Output<Tensor>("Out");
+  z->mutable_data<T>(ctx.GetPlace());
+
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+  PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+                    "Rank of first input must >= rank of second input.")
+
+  if (x_dims == y_dims || product(y_dims) == 1) {
+    functor f;
+    f.template Run<Place, T>(x, y, z, ctx);
+    return;
+  }
+
+  int axis = ctx.Attr<int>("axis");
+  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+  PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+                 "Axis should be in range [0, x_dims)");
+
+  int pre, n, post;
+  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+  if (post == 1) {
+    functor f;
+    f.template RunBroadCast<Place, T>(x, y, z, ctx, pre, n);
+    return;
+  } else {
+    functor f;
+    f.template RunBroadCast2<Place, T>(x, y, z, ctx, pre, n, post);
+    return;
+  }
+}
+
+#define EIGEN_ADD(x, y) ((x) + (y))
+EIGEN_FUNCTOR(Add, EIGEN_ADD);
+
+#define EIGEN_SUB(x, y) ((x) - (y))
+EIGEN_FUNCTOR(Sub, EIGEN_SUB);
+
+#define EIGEN_MUL(x, y) ((x) * (y))
+EIGEN_FUNCTOR(Mul, EIGEN_MUL);
+
+#define EIGEN_DIV(x, y) ((x) / (y))
+EIGEN_FUNCTOR(Div, EIGEN_DIV);
+
+template <typename Place, typename T, typename functor, typename functor1,
+          typename broadcastfunctor, typename broadcast2functor>
+void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
+  using Tensor = framework::Tensor;
+
+  auto* x = ctx.Input<Tensor>("X");
+  auto* y = ctx.Input<Tensor>("Y");
+  auto* out = ctx.Input<Tensor>("Out");
+  auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+  auto place = ctx.GetEigenDevice<Place>();
+
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+
+  auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+  auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+  if (dx) {
+    dx->mutable_data<T>(ctx.GetPlace());
+  }
+  if (dy) {
+    dy->mutable_data<T>(ctx.GetPlace());
+  }
+
+  if (x_dims == y_dims) {
+    functor f;
+    f(place, x, y, out, dx, dy, dout);
+    return;
+  }
+
+  if (product(y_dims) == 1) {
+    functor1 f;
+    f(place, x, y, out, dx, dy, dout);
+    return;
+  }
+
+  int axis = ctx.Attr<int>("axis");
+  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+
+  int pre, n, post;
+  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+
+  if (post == 1) {
+    broadcastfunctor f;
+    f(place, x, y, out, dx, dy, dout, pre, n);
+    return;
+  } else {
+    broadcast2functor f;
+    f(place, x, y, out, dx, dy, dout, pre, n, post);
+    return;
+  }
+}
+
+class ElementwiseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  using Tensor = framework::Tensor;
+  void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of elementwise op should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
+                            "Input(Y) of elementwise op should not be null");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Out"),
+        "Output(Out) of elementwise op should not be null.");
+
+    auto x_dim = ctx.Input<Tensor>("X")->dims();
+    auto y_dim = ctx.Input<Tensor>("Y")->dims();
+    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
+                      "Rank of first input must >= rank of second input.")
+    ctx.Output<framework::Tensor>("Out")->Resize(x_dim);
+    ctx.ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ElementwiseOpMaker(framework::OpProto* proto,
+                     framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", R"DOC(
+The first input of elementwise op, it's a tensor of any dimensions.
+)DOC");
+    AddInput("Y", R"DOC(
+The sencond input of elementwise op, it's a tensor and it's dimensions
+must be small or equal to X's dimensions.
+)DOC");
+    AddAttr<int>("axis",
+                 R"DOC(
+When the shape(Y) does not equal the shape(X),Y will be broadcasted 
+to match the shape of X and axis should be dimension index Y in X
+        )DOC")
+        .SetDefault(-1)
+        .EqualGreaterThan(-1);
+
+    AddOutput("Out", "The output of elementwise op");
+    comment_ = R"DOC(
+Limited elementwise {name} operator.The equation is: Out = {equation}.
+1. The shape of Y should be same with X or
+2. Y's shape is a subset of X. 
+   Y will be broadcasted to match the shape of X and axis should be dimension index Y in X.
+
+   example:
+      shape(X) = (2, 3, 4, 5), shape(Y) = (,)
+      shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
+      shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
+      shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
+      shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
+
+Both the input X and Y can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD with input X.
+)DOC";
+    AddComment(comment_);
+  }
+
+ protected:
+  std::string comment_;
+
+  void Replace(std::string& src, std::string from, std::string to) {
+    std::size_t len_from = std::strlen(from.c_str());
+    std::size_t len_to = std::strlen(to.c_str());
+    for (std::size_t pos = src.find(from); pos != std::string::npos;
+         pos = src.find(from, pos + len_to)) {
+      src.replace(pos, len_from, to);
+    }
+  }
+
+  void SetComment(std::string name, std::string equation) {
+    Replace(comment_, "{name}", name);
+    Replace(comment_, "{equation}", equation);
+  }
+};
+
+class ElementwiseOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  using Tensor = framework::Tensor;
+
+ protected:
+  void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
+                            "Input(Out@GRAD) should not be null");
+
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
+    auto y_dims = ctx.Input<Tensor>("Y")->dims();
+    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
+    auto* x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* y_grad = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
+
+    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+                      "Rank of first input must >= rank of second input.")
+
+    if (x_grad) {
+      x_grad->Resize(x_dims);
+    }
+
+    if (y_grad) {
+      y_grad->Resize(y_dims);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_sub_op.cc b/paddle/operators/elementwise_sub_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..31c37ff7ab5595c29f973929387d3945b6f3aaf8
--- /dev/null
+++ b/paddle/operators/elementwise_sub_op.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/elementwise_sub_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwiseSubOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwiseSubOpMaker(framework::OpProto* proto,
+                        framework::OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Sub", "Out = X - Y");
+    AddComment(comment_);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker,
+            elementwise_sub_grad, ops::ElementwiseOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_sub,
+    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_sub_grad,
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/elementwise_sub_op.cu b/paddle/operators/elementwise_sub_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0efb92fce9975ed9fa029a3ce919589d09efb0d7
--- /dev/null
+++ b/paddle/operators/elementwise_sub_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_sub_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(
+    elementwise_sub,
+    ops::ElementwiseSubKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    elementwise_sub_grad,
+    ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6bc66cd0e1594a8bc7070e2f182401b92d1c88e
--- /dev/null
+++ b/paddle/operators/elementwise_sub_op.h
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class ElementwiseSubKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseCompute<EigenSubFunctor, Place, T>(ctx);
+  }
+};
+
+template <typename T>
+struct ElementwiseSubGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0) * dz_e;
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseSubOneGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0) * dz_e.sum();
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseSubBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0) *
+                       dz_e.reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseSubBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0) *
+                       dz_e.reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+
+template <typename Place, typename T>
+class ElementwiseSubGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<Place, T, ElementwiseSubGradFunctor<T>,
+                           ElementwiseSubOneGradFunctor<T>,
+                           ElementwiseSubBroadCastGradFunctor<T>,
+                           ElementwiseSubBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc
index e5d0f3c3724262a60a463ef3beadd9906d3ebaf6..5ac0e8cc45f007d42f1b6d7f86333f5cbedb3ea8 100644
--- a/paddle/operators/fc_op.cc
+++ b/paddle/operators/fc_op.cc
@@ -186,6 +186,9 @@ W_i is a 2-D matrix of size (K x N), where N means the number of neurons
 in the fully connected layer. B is a 1-D vector of size N.
 Thus, the output Out is a 2-D matrix of size (M x N).
 Activation type can be set to `identity` (default), `sigmoid` or `softmax`.
+
+All the inputs can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD with first input (`X[0]`).
 )DOC");
   }
 };
diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc
index ba7857cc65f6860a6156674c6addc2bfdce21a99..761a527a5574edc779340ec595dfe1bc1964438a 100644
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -23,15 +23,14 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.InputVar("Src"),
-        "Input(Src) of FillZerosLikeOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.OutputVar("Dst"),
-        "Output(Dst) of FillZerosLikeOp should not be null.");
-
-    ctx.Output<framework::LoDTensor>("Dst")->Resize(
-        ctx.Input<framework::Tensor>("Src")->dims());
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of FillZerosLikeOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"),
+                            "Output(Y) of FillZerosLikeOp should not be null.");
+
+    ctx.Output<framework::Tensor>("Y")->Resize(
+        ctx.Input<framework::Tensor>("X")->dims());
+    ctx.ShareLoD("X", /*->*/ "Y");
   }
 };
 
@@ -40,8 +39,8 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
   FillZerosLikeOpMaker(framework::OpProto *proto,
                        framework::OpAttrChecker *op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Src", "The input of fill-zeros-like op.");
-    AddOutput("Dst", "The varibale will be filled up with zeros.");
+    AddInput("X", "The input of fill-zeros-like op.");
+    AddOutput("Y", "The varibale will be filled up with zeros.");
     AddComment(R"DOC(
 Fill up a vriable with zeros.
 
diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h
index 969998ce2eae02b8ad057c6259703e51559bf98a..4474581784531faee1741f0b143743e31cc3788f 100644
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -23,7 +23,7 @@ template <typename Place, typename T>
 class FillZerosLikeKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* output = context.Output<framework::Tensor>("Dst");
+    auto* output = context.Output<framework::Tensor>("Y");
     output->mutable_data<T>(context.GetPlace());
     auto t = framework::EigenVector<T>::Flatten(*output);
     t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc
index d445b61c1657356f2cdcf1e98d756607de2bd042..fecd1ce2147a1e6f2f7928266be74ed7b647c5b9 100644
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -35,7 +35,7 @@ class GatherOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
     framework::DDim output_dims(ctx.Input<Tensor>("X")->dims());
     output_dims[0] = batch_size;
-    ctx.Output<framework::LoDTensor>("Out")->Resize(output_dims);
+    ctx.Output<framework::Tensor>("Out")->Resize(output_dims);
   }
 };
 
@@ -45,7 +45,7 @@ class GatherGradOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto X_grad = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto X_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
     auto X = ctx.Input<Tensor>("X");
 
     X_grad->Resize(X->dims());
diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc
index c0e161bbc0c5486eb10408e43e6388f1b287abf8..5b7cbb5cc7bcb7e43b15363d37d7b8f2cbf0fbdc 100644
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -48,7 +48,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
         ctx.OutputVar("Out"),
         "Output(Out) of GaussianRandomOp should not be null.");
 
-    auto* tensor = ctx.Output<framework::LoDTensor>("Out");
+    auto* tensor = ctx.Output<framework::Tensor>("Out");
     auto dims = Attr<std::vector<int>>("dims");
     std::vector<int64_t> temp;
     temp.reserve(dims.size());
diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc
index 07f6dfabca5879e3de6004e59d2e87f7fa68d66c..04ac24662e9cfec6a49cd213cb76bdebc7b730c8 100644
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -32,9 +32,10 @@ class LookupTableOp : public framework::OperatorWithKernel {
 
     auto table_t = ctx.Input<Tensor>("W");
     auto ids_t = ctx.Input<Tensor>("Ids");
-    auto output_t = ctx.Output<framework::LoDTensor>("Out");
+    auto output_t = ctx.Output<framework::Tensor>("Out");
 
     output_t->Resize({ids_t->dims()[0], table_t->dims()[1]});
+    ctx.ShareLoD("Ids", /*->*/ "Out");
   }
 };
 
@@ -50,9 +51,13 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
              "An input with type int32 or int64"
              "contains the ids to be looked up in W.");
     AddOutput("Out", "The lookup results, which have the same type with W.");
-    AddComment(
-        "This operator is used to perform lookups on the parameter W,"
-        "then concatenated into a dense tensor.");
+    AddComment(R"DOC(
+This operator is used to perform lookups on the parameter W,
+then concatenated into a dense tensor.
+
+The input `Ids` can carry the LoD (Level of Details) information,
+or not. And the output only shares the LoD with input `Ids`.
+)DOC");
   }
 };
 
@@ -64,7 +69,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
   void InferShape(const framework::InferShapeContext &context) const override {
     auto table = context.Input<Tensor>("W");
     auto d_table =
-        context.Output<framework::LoDTensor>(framework::GradVarName("W"));
+        context.Output<framework::Tensor>(framework::GradVarName("W"));
     d_table->Resize(table->dims());
   }
 };
diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc
index 7d7eeb59a23435036dc33c1e4fe6dd1c4a1a2f62..b04384bda81b93f5db0be3206eee10ad5e854540 100644
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -27,7 +27,7 @@ class MeanOp : public framework::OperatorWithKernel {
                             "Input(X) of MeanOp should not be null.");
     PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
                             "Output(Out) of MeanOp should not be null.");
-    ctx.Output<framework::LoDTensor>("Out")->Resize({1});
+    ctx.Output<framework::Tensor>("Out")->Resize({1});
   }
 };
 
@@ -37,7 +37,8 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of mean op");
     AddOutput("Out", "The output of mean op").NotInGradient();
-    AddComment("Mean Operator");
+    AddComment(R"DOC( Mean Operator
+)DOC");
   }
 };
 
@@ -47,7 +48,7 @@ class MeanGradOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    ctx.Output<framework::LoDTensor>(framework::GradVarName("X"))
+    ctx.Output<framework::Tensor>(framework::GradVarName("X"))
         ->Resize(ctx.Input<Tensor>("X")->dims());
   }
 };
diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc
index a97bbecdca1779df330d1053cf359bb658aa75c2..29cb85489bd05f6c1e7143d962eac0af26e75825 100644
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -40,7 +40,8 @@ class MinusOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         left_tensor->numel(), right_tensor->numel(),
         "Minus operator must take two tensor with same num of elements");
-    ctx.Output<framework::LoDTensor>("Out")->Resize(left_tensor->dims());
+    ctx.Output<framework::Tensor>("Out")->Resize(left_tensor->dims());
+    ctx.ShareLoD("X", /*->*/ "Out");
   }
 };
 
@@ -54,7 +55,12 @@ class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
 
     AddComment(R"DOC(Minus Operator
 
-Equation: Out = X - Y
+Equation:
+
+    Out = X - Y
+
+Both the input `X` and `Y` can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD with input `X`.
 )DOC");
   }
 };
diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc
index 6fe018f9a8fd74479a2feed07379c3179b7c72bd..8606c0d1e1bf7a52299528d30af0367d9f93edd2 100644
--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
@@ -34,8 +34,8 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(x->dims().size(), 2, "The tensor rank of X must be 2.");
     PADDLE_ENFORCE_EQ(x->dims()[1], 1, "The 2nd dimension of X must be 1.");
 
-    context.Output<framework::LoDTensor>("IntermediateVal")->Resize(x->dims());
-    context.Output<framework::LoDTensor>("Out")->Resize({x->dims()[0], 1});
+    context.Output<framework::Tensor>("IntermediateVal")->Resize(x->dims());
+    context.Output<framework::Tensor>("Out")->Resize({x->dims()[0], 1});
   }
 };
 
@@ -81,7 +81,7 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
     auto* intermediate_val = context.Input<Tensor>("IntermediateVal");
     auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
     auto* x_grad =
-        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
 
     PADDLE_ENFORCE_NOT_NULL(x, "X must be initialized.");
     PADDLE_ENFORCE_NOT_NULL(y, "Y must be initialized.");
diff --git a/paddle/operators/modified_huber_loss_op.h b/paddle/operators/modified_huber_loss_op.h
index 2b2aae17084992c4935a697763ff902e455dfcbd..cb51007749e3c59572d4852959f4119ac377decc 100644
--- a/paddle/operators/modified_huber_loss_op.h
+++ b/paddle/operators/modified_huber_loss_op.h
@@ -52,8 +52,8 @@ class ModifiedHuberLossKernel : public framework::OpKernel {
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in0 = context.Input<Tensor>("X");
     auto* in1 = context.Input<Tensor>("Y");
-    auto* out0 = context.Output<framework::LoDTensor>("IntermediateVal");
-    auto* out1 = context.Output<framework::LoDTensor>("Out");
+    auto* out0 = context.Output<framework::Tensor>("IntermediateVal");
+    auto* out1 = context.Output<framework::Tensor>("Out");
 
     out0->mutable_data<T>(context.GetPlace());
     out1->mutable_data<T>(context.GetPlace());
@@ -77,11 +77,9 @@ class ModifiedHuberLossGradCPUKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in0 = context.Input<Tensor>("Y");
-    auto* in1 = context.Input<framework::LoDTensor>("IntermediateVal");
-    auto* in2 =
-        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-    auto* out0 =
-        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto* in1 = context.Input<framework::Tensor>("IntermediateVal");
+    auto* in2 = context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<framework::Tensor>(framework::GradVarName("X"));
 
     if (out0) {
       const T* y_ptr = in0->data<T>();
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index b6d320b415e02549e85cb36ab517b0b5433887d5..7047718a3f1bf7e9598952efa1d9bcb20d5cf5b4 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -18,7 +18,6 @@ namespace paddle {
 namespace operators {
 
 using framework::Tensor;
-using framework::LoDTensor;
 
 class MulOp : public framework::OperatorWithKernel {
  public:
@@ -53,8 +52,9 @@ class MulOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         x_mat_dims[1], y_mat_dims[0],
         "First matrix's width must be equal with second matrix's height.");
-    ctx.Output<framework::LoDTensor>("Out")->Resize(
+    ctx.Output<framework::Tensor>("Out")->Resize(
         {x_mat_dims[0], y_mat_dims[1]});
+    ctx.ShareLoD("X", /*->*/ "Out");
   }
 };
 
@@ -83,9 +83,14 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(1)
         .EqualGreaterThan(1);
     AddComment(R"DOC(
-Two Element Mul Operator.
+Mul operator is used to perform matrix multiplication for input X and Y.
 
-The equation is: Out = X * Y
+The equation is:
+
+    Out = X * Y
+
+Both the input `X` and `Y` can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD with input `X`.
 )DOC");
   }
 };
@@ -103,10 +108,8 @@ class MulOpGrad : public framework::OperatorWithKernel {
     auto x_dims = ctx.Input<Tensor>("X")->dims();
     auto y_dims = ctx.Input<Tensor>("Y")->dims();
     auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
-    auto *x_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto *y_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
+    auto *x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto *y_grad = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
 
     auto x_mat_dims =
         framework::flatten_to_2d(x_dims, Attr<int>("x_num_col_dims"));
diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6e77b86b5698a263b850a973cd1b8644a0aa2201
--- /dev/null
+++ b/paddle/operators/multiplex_op.cc
@@ -0,0 +1,113 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/multiplex_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+class MultiplexOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(!ctx.MultiInputVar("X").empty(),
+                   "Input(X) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) shouldn't be null.");
+    auto ins = ctx.MultiInput<Tensor>("X");
+    auto *out = ctx.Output<LoDTensor>("Out");
+    auto num_ins = ins.size();
+    PADDLE_ENFORCE(num_ins > 2,
+                   "multiplex operator should have more than 2 inputs.");
+    PADDLE_ENFORCE_EQ(ins[0]->dims().size(), 1,
+                      "The first input must be a index vector.");
+    auto in_dim = ins[1]->dims();
+
+    for (size_t i = 2; i < num_ins; i++) {
+      auto dim = ins[i]->dims();
+      PADDLE_ENFORCE(
+          in_dim == dim,
+          "All the input tensors except the first one must have the same size");
+    }
+    out->Resize(in_dim);
+  }
+};
+
+class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MultiplexOpMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensors of multiplex operator.").AsDuplicable();
+    AddOutput("Out", "The output tensor of multiplex operator.");
+    AddComment(R"DOC(Multiplex operator
+
+Multiplex multiple tensors according to the index provided by the first
+input tensor.
+
+ins[0]: the index tensor.
+ins[1:N]: the candidate output tensors.
+For each index i from 0 to batchSize - 1, the output is the i-th row of the
+the (index[i] + 1)-th tensor.
+
+For i-th row of the output tensor:
+
+y[i][j] = x_{k}[i][j], j = 0,1, ... , (x_{1}.width - 1)
+
+where y is the output tensor. `x_{k}` is the k-th input tensor
+and `k = x{0}[i] + 1`.
+
+)DOC");
+  }
+};
+
+class MultiplexGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(!ctx.MultiInputVar("X").empty(),
+                   "Input(X) should not be null");
+    PADDLE_ENFORCE(!ctx.MultiOutputVar(framework::GradVarName("X")).empty(),
+                   "Output(X@Grad) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
+                            "Input(Out@GRAD) shouldn't be null.");
+    auto d_ins = ctx.MultiOutput<LoDTensor>(framework::GradVarName("X"));
+    auto ins = ctx.MultiInput<Tensor>("X");
+    // don't compute gradient for index (ins[0])
+    for (size_t i = 1; i < ins.size(); i++) {
+      if (d_ins[i]) {
+        d_ins[i]->Resize(ins[i]->dims());
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OP(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, multiplex_grad,
+            ops::MultiplexGradOp);
+REGISTER_OP_CPU_KERNEL(
+    multiplex, ops::MultiplexCPUKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    multiplex_grad,
+    ops::MultiplexGradCPUKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4736f15bd594178168e3bcf799142d0fc18bff13
--- /dev/null
+++ b/paddle/operators/multiplex_op.cu
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/multiplex_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class MultiplexGPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::LoDTensor>("Out");
+
+    out->mutable_data<T>(ctx.GetPlace());
+
+    auto rows = ins[1]->dims()[0];
+    auto cols = ins[1]->dims()[1];
+    // copy index to cpu
+    framework::Tensor index_t_cpu;
+    index_t_cpu.CopyFrom<T>(*(ins[0]), platform::CPUPlace());
+    auto* index = index_t_cpu.data<T>();
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                      ctx.device_context())
+                      .stream();
+    Place place = boost::get<Place>(ctx.GetPlace());
+    for (auto i = 0; i < rows; i++) {
+      int k = (int)index[i] + 1;
+      PADDLE_ENFORCE_LT(k, ins.size(),
+                        "index exceeds the number of candidate tensors.");
+      memory::Copy(place, out->data<T>() + i * cols, place,
+                   ins[k]->data<T>() + i * cols, cols * sizeof(T), stream);
+    }
+  }
+};
+
+template <typename Place, typename T>
+class MultiplexGradGPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto d_ins =
+        ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
+    for (size_t i = 1; i < d_ins.size(); i++) {
+      if (d_ins[i]) {
+        d_ins[i]->mutable_data<T>(ctx.GetPlace());
+        auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
+        t.device(ctx.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+      }
+    }
+
+    auto rows = ins[1]->dims()[0];
+    auto cols = ins[1]->dims()[1];
+    // copy index to cpu
+    framework::Tensor index_t_cpu;
+    index_t_cpu.CopyFrom<T>(*(ins[0]), platform::CPUPlace());
+    auto* index = index_t_cpu.data<T>();
+
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                      ctx.device_context())
+                      .stream();
+    Place place = boost::get<Place>(ctx.GetPlace());
+    for (auto i = 0; i < rows; i++) {
+      int k = (int)index[i] + 1;
+      if (d_ins[k]) {
+        memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
+                     d_out->data<T>() + i * cols, cols * sizeof(T), stream);
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(
+    multiplex, ops::MultiplexGPUKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    multiplex_grad,
+    ops::MultiplexGradGPUKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/multiplex_op.h b/paddle/operators/multiplex_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..44e8e0c1998014081b7e0aac603d573aba1f4a13
--- /dev/null
+++ b/paddle/operators/multiplex_op.h
@@ -0,0 +1,78 @@
+
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class MultiplexCPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::LoDTensor>("Out");
+
+    out->mutable_data<T>(ctx.GetPlace());
+
+    auto rows = ins[1]->dims()[0];
+    auto cols = ins[1]->dims()[1];
+    auto* index = ins[0]->data<T>();
+    Place place = boost::get<Place>(ctx.GetPlace());
+    for (auto i = 0; i < rows; i++) {
+      int k = (int)index[i] + 1;
+      PADDLE_ENFORCE_LT(k, ins.size(),
+                        "index exceeds the number of candidate tensors.");
+      memory::Copy(place, out->data<T>() + i * cols, place,
+                   ins[k]->data<T>() + i * cols, cols * sizeof(T));
+    }
+  }
+};
+
+template <typename Place, typename T>
+class MultiplexGradCPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto d_ins =
+        ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
+    for (size_t i = 1; i < d_ins.size(); i++) {
+      if (d_ins[i]) {
+        d_ins[i]->mutable_data<T>(ctx.GetPlace());
+        auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
+        t.device(ctx.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+      }
+    }
+
+    auto rows = ins[1]->dims()[0];
+    auto cols = ins[1]->dims()[1];
+    auto* index = ins[0]->data<T>();
+    Place place = boost::get<Place>(ctx.GetPlace());
+    for (auto i = 0; i < rows; i++) {
+      int k = (int)index[i] + 1;
+      if (d_ins[k]) {
+        memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
+                     d_out->data<T>() + i * cols, cols * sizeof(T));
+      }
+    }
+  }
+};
+}
+}
diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc
index a0b1c6b631d97a40d774f7d2ff9550fda9c32db4..375d8a35acc0716259071c31bc332fdf5aabce1c 100644
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@@ -39,8 +39,13 @@ class PadOp : public framework::OperatorWithKernel {
     for (int i = 0; i < x_dim.size(); ++i) {
       out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1];
     }
-    ctx.Output<framework::LoDTensor>("Out")->Resize(
+    ctx.Output<framework::Tensor>("Out")->Resize(
         framework::make_ddim(out_dims));
+    if (out_dims[0] == x_dim[0]) {
+      // Only pass LoD when the first dimension is equal between
+      // output and input.
+      ctx.ShareLoD("X", /*->*/ "Out");
+    }
   }
 };
 
@@ -101,7 +106,7 @@ class PadOpGrad : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
                             "Input(Out@GRAD) should not be null");
     auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto *x_g = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto *x_g = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
     if (x_g != nullptr) {
       x_g->Resize(x_dims);
     }
diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc
index 7ae80b296850f2f433c89d904ebf32355b2a29c7..912196c190b5ddbd4e3482a5314e949186b94368 100644
--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
@@ -36,8 +36,9 @@ class PReluOp : public framework::OperatorWithKernel {
 
     PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
                             "Output(Out) should not be null");
-    auto *out = ctx.Output<framework::LoDTensor>("Out");
+    auto *out = ctx.Output<framework::Tensor>("Out");
     out->Resize(in->dims());
+    ctx.ShareLoD("X", /*->*/ "Out");
   }
 };
 
@@ -55,6 +56,8 @@ The equation is:
   f(x) = alpha * x , for x < 0
   f(x) = x         , for x >= 0
 
+The input `X` can carry the LoD (Level of Details) information,
+or not. And the output shares the LoD with input `X`.
 )DOC");
   }
 };
@@ -69,11 +72,11 @@ class PReluGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
                             "Input(Out@GRAD) should not be null");
-    auto *dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto *dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
     auto *x = ctx.Input<framework::Tensor>("X");
 
     auto *dalpha =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Alpha"));
+        ctx.Output<framework::Tensor>(framework::GradVarName("Alpha"));
     auto *alpha = ctx.Input<framework::Tensor>("Alpha");
 
     dx->Resize(x->dims());
diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc
index 4bba4200728ebf7e7810ed935f6fdf51c96cbc7a..39af08c8751c3b95cf5fdef7395186a0176a20a2 100644
--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
@@ -40,7 +40,7 @@ class RankLossOp : public framework::OperatorWithKernel {
                    "All inputs must have the same size");
     PADDLE_ENFORCE((label_dims.size() == 2) && (label_dims[1] == 1),
                    "All inputs must be row vector with size batch_size x 1.");
-    ctx.Output<framework::LoDTensor>("Out")->Resize(label_dims);
+    ctx.Output<framework::Tensor>("Out")->Resize(label_dims);
   }
 };
 
@@ -102,9 +102,9 @@ class RankLossGradOp : public framework::OperatorWithKernel {
                             "Input(Out@GRAD) shouldn't be null.");
     auto dims = ctx.Input<framework::Tensor>("Left")->dims();
     auto *left_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Left"));
+        ctx.Output<framework::Tensor>(framework::GradVarName("Left"));
     auto *right_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Right"));
+        ctx.Output<framework::Tensor>(framework::GradVarName("Right"));
     if (left_grad) {
       left_grad->Resize(dims);
     }
diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h
index 9776d123fe4b0cb0cd16a15770fcf42a966fa011..7df195ff47ecfd79388385eed4bd37b8c9b45979 100644
--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/operators/rank_loss_op.h
@@ -24,7 +24,7 @@ template <typename Place, typename T>
 class RankLossKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
-    auto* out_t = ctx.Output<framework::LoDTensor>("Out");
+    auto* out_t = ctx.Output<framework::Tensor>("Out");
     auto* label_t = ctx.Input<framework::Tensor>("Label");
     auto* left_t = ctx.Input<framework::Tensor>("Left");
     auto* right_t = ctx.Input<framework::Tensor>("Right");
@@ -46,9 +46,9 @@ class RankLossGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_left_t =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Left"));
+        ctx.Output<framework::Tensor>(framework::GradVarName("Left"));
     auto* d_right_t =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Right"));
+        ctx.Output<framework::Tensor>(framework::GradVarName("Right"));
 
     auto* d_out_t = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto* label_t = ctx.Input<framework::Tensor>("Label");
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index ad985839f5908d9235a4dbefc9b841362810114e..e7deaf9940699b938e4f36358c2c7f3ba15e918b 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -80,7 +80,6 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
   // Now all variables in scope must be created outside of op.
   PADDLE_ENFORCE_NOT_NULL(stepnet_);
   PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), "stepnet_ op has no outputs");
-  PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), "net_op has no outputs");
 
   if (seq_len_ > step_scopes->size()) {
     for (size_t i = step_scopes->size(); i < seq_len_; ++i) {
@@ -129,8 +128,8 @@ const rnn::ArgumentName RecurrentOp::kArgName{
     "memories", "pre_memories", "boot_memories"};
 
 const rnn::ArgumentName RecurrentGradientOp::kArgName{
-    "step_net", "step_scopes",  "outlink@grad",      "inlink@grad",
-    "memories", "pre_memories", "boot_memories@grad"};
+    "step_net", "step_scopes@GRAD", "outlinks@GRAD",     "inlinks@GRAD",
+    "memories", "pre_memories",     "boot_memories@GRAD"};
 
 RecurrentOp::RecurrentOp(const std::string& type,
                          const framework::VariableNameMap& inputs,
@@ -226,13 +225,13 @@ RecurrentGradientOp::RecurrentGradientOp(
     const framework::VariableNameMap& outputs,
     const framework::AttributeMap& attrs)
     : OperatorBase(type, inputs, outputs, attrs) {
-  rnn::InitArgument(kArgName, &arg_, *this);
+  rnn::InitArgument(kArgName, &arg_, *this, true /*is grad*/);
   alg_.Init(&arg_, &stepnet_);
 }
 
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_WITHOUT_GRADIENT(
-    recurrent, paddle::operators::RecurrentOp,
-    paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker);
+REGISTER_OP(recurrent, paddle::operators::RecurrentOp,
+            paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker,
+            recurrent_grad, paddle::operators::RecurrentGradientOp);
diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h
index 1033d657a3a8f96c8b3dae8dd93d3f1f6840b59b..ad4df9e55b91dbe89c34762945cd9edefde86e08 100644
--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
@@ -22,7 +22,7 @@ namespace paddle {
 namespace operators {
 
 // The sequence format in RecurrentOp is Tensor<seq_len, batch_size, dim> now.
-// TODO(Yan Chunwei):
+// TODO(Superjom)
 // 1. No-padding computing for sequences with indifinite length in one batch.
 // 2. Hierarchical RNN for sequence with sub-sequence.
 // 3. Internal Memory.
@@ -177,6 +177,9 @@ class RecurrentGradientOp : public framework::OperatorBase {
 
   static const rnn::ArgumentName kArgName;
 
+  /*
+   * set a stepnet that is created according to a RecurrentOp's stepnet.
+   */
   void set_stepnet(std::unique_ptr<OperatorBase> net) {
     stepnet_ = std::move(net);
   }
diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc
index d5d04dac275157e9538e8fdbcfb64b61e7ea657a..8874f258715e28a956c06dd98aa387d5cc690c70 100644
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
@@ -49,7 +49,12 @@ class ReshapeOp : public framework::OperatorWithKernel {
     std::transform(shape.begin(), shape.end(), shape_int64.begin(),
                    [](int a) { return static_cast<int64_t>(a); });
     auto out_dims = framework::make_ddim(shape_int64);
-    ctx.Output<framework::LoDTensor>("Out")->Resize(out_dims);
+    ctx.Output<framework::Tensor>("Out")->Resize(out_dims);
+    if (shape[0] == in->dims()[0]) {
+      // Only pass LoD when the first dimension is equal between
+      // output and input.
+      ctx.ShareLoD("X", /*->*/ "Out");
+    }
   }
 };
 
@@ -93,7 +98,7 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
                             "Input(Out@GRAD) shouldn't be null.");
     auto dims = ctx.Input<framework::Tensor>("X")->dims();
-    auto *d_in = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto *d_in = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
     d_in->Resize(dims);
   }
 };
diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc
index ca7219b26d83eb6b8db75a5ed9cd360c5ac1d5df..a767009d2366e20d2ebd35f562b8df7d408f2d4e 100644
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
@@ -109,15 +109,14 @@ void LinkMemories(const std::vector<Scope*>& scopes,
 }
 
 void InitArgument(const ArgumentName& name, Argument* arg,
-                  const framework::OperatorBase& op) {
-  arg->step_scopes = op.Output(name.step_scopes);
-
+                  const framework::OperatorBase& op, bool is_grad) {
+  arg->step_scopes =
+      is_grad ? op.Input(name.step_scopes) : op.Output(name.step_scopes);
   arg->inlinks = op.Inputs(name.inlinks);
-
   arg->outlinks = op.Outputs(name.outlinks);
 
-  auto boot_memories = op.Inputs(name.boot_memories);
-
+  auto boot_memories =
+      is_grad ? op.Outputs(name.boot_memories) : op.Inputs(name.boot_memories);
   // attributes
   auto memories = op.Attr<std::vector<std::string>>(name.memories);
   auto pre_memories = op.Attr<std::vector<std::string>>(name.pre_memories);
diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h
index 7dafe5d0088c4c8bf2cad163654e7e4f28eebe2e..9c777f1e9067a3e2ceb9d23f7bf7d3c73343c91f 100644
--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ b/paddle/operators/rnn/recurrent_op_utils.h
@@ -78,7 +78,7 @@ void LinkMemories(const std::vector<Scope*>& step_scopes,
                   const int offset, bool infer_shape_mode);
 
 void InitArgument(const ArgumentName& name, Argument* arg,
-                  const framework::OperatorBase& op);
+                  const framework::OperatorBase& op, bool is_grad = false);
 
 }  // namespace rnn
 }  // namespace operators
diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index 2a3fd3be941d91aaa6b014df91d3025f07767577..fc3ad721f210213491617452141dfa8834b067c0 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -44,7 +44,8 @@ class RowwiseAddOp : public framework::OperatorWithKernel {
         framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims,
         "The width of two operands must be same");
     PADDLE_ENFORCE_EQ(ctx.OutputSize("Out"), 1, "The output size must be 1");
-    ctx.Output<framework::LoDTensor>("Out")->Resize(x_dims);
+    ctx.Output<framework::Tensor>("Out")->Resize(x_dims);
+    ctx.ShareLoD("X", /*->*/ "Out");
   }
 };
 
@@ -83,8 +84,8 @@ class RowwiseAddGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims,
         "The width of two operands must be same");
-    auto *dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto *db = ctx.Output<framework::LoDTensor>(framework::GradVarName("b"));
+    auto *dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto *db = ctx.Output<framework::Tensor>(framework::GradVarName("b"));
     if (dx) dx->Resize(x_dims);
     if (db) db->Resize(b_dims);
   }
diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc
index d1f42e8662537d35e17429f9d436fdc0e5a1dc11..1ae77a9722ef1a5548a6c4100c32fdddcee8c5cd 100644
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -33,8 +33,9 @@ class ScaleOp : public framework::OperatorWithKernel {
                             "Output(Out) of ScaleOp should not be null.");
 
     auto *in = ctx.Input<framework::Tensor>("X");
-    auto *out = ctx.Output<framework::LoDTensor>("Out");
+    auto *out = ctx.Output<framework::Tensor>("Out");
     out->Resize(in->dims());
+    ctx.ShareLoD("X", /*->*/ "Out");
   }
 };
 
diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc
index 8820262732327306f4f807702751708bd1e2aa36..3f02081a060281dec533c02b346f0667da28b8c3 100644
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -44,7 +44,7 @@ class ScatterOp : public framework::OperatorWithKernel {
     framework::DDim data_dim(ctx.Input<Tensor>("Updates")->dims());
     for (int i = 1; i < data_dim.size(); ++i)
       PADDLE_ENFORCE_EQ(data_dim[i], ctx.Input<Tensor>("Updates")->dims()[i]);
-    ctx.Output<framework::LoDTensor>("Out")->Resize(
+    ctx.Output<framework::Tensor>("Out")->Resize(
         ctx.Input<Tensor>("Ref")->dims());
   }
 };
@@ -56,10 +56,9 @@ class ScatterGradOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
     auto *dUpdates =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Updates"));
+        ctx.Output<framework::Tensor>(framework::GradVarName("Updates"));
     auto *Updates = ctx.Input<Tensor>("Updates");
-    auto *dRef =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Ref"));
+    auto *dRef = ctx.Output<framework::Tensor>(framework::GradVarName("Ref"));
     auto *Ref = ctx.Input<Tensor>("Ref");
 
     dRef->Resize(Ref->dims());
diff --git a/paddle/operators/sequence_avg_pool_op.cc b/paddle/operators/sequence_pool_op.cc
similarity index 53%
rename from paddle/operators/sequence_avg_pool_op.cc
rename to paddle/operators/sequence_pool_op.cc
index ff354c62b6f45781ebdacf22c06deb93f53adf44..6c1a2bc3fd35416e367c0cde67e05e036bb68f36 100644
--- a/paddle/operators/sequence_avg_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -12,22 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sequence_avg_pool_op.h"
+#include "paddle/operators/sequence_pool_op.h"
 
 namespace paddle {
 namespace operators {
 
-class SequenceAvgPoolOp : public framework::OperatorWithKernel {
+class SequencePoolOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
   void InferShape(const framework::InferShapeContext& ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.InputVar("X"), "Input(X) of SequenceAvgPoolOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of SequencePoolOp should not be null.");
     PADDLE_ENFORCE_NOT_NULL(
         ctx.OutputVar("Out"),
-        "Output(Out) of SequenceAvgPoolOp should not be null.");
+        "Output(Out) of SequencePoolOp should not be null.");
 
     auto* x = ctx.Input<framework::LoDTensor>("X");
     auto dims = x->dims();
@@ -42,21 +42,45 @@ class SequenceAvgPoolOp : public framework::OperatorWithKernel {
   }
 };
 
-class SequenceAvgPoolOpMaker : public framework::OpProtoAndCheckerMaker {
+class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceAvgPoolOpMaker(framework::OpProto* proto,
-                         framework::OpAttrChecker* op_checker)
+  SequencePoolOpMaker(framework::OpProto* proto,
+                      framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of SequenceAvgPoolOp.");
-    AddOutput("Out", "The output of SequenceAvgPoolOp.");
+    AddInput("X",
+             "A float LoDTensor, the variable-length input of SequencePoolOp");
+    AddOutput(
+        "Out",
+        "A float LoDTensor, the variable-length output of SequencePoolOp.");
+    AddAttr<int>(
+        "strategy",
+        "(int, default AVERAGE) the pooling strategy of SequencePoolOp.")
+        .SetDefault(AVERAGE)
+        .InEnum({AVERAGE, SUM, SQRT, MAX, LAST, FIRST});
     AddComment(R"DOC(
-    SequenceAvgPoolOp averages features of all time-steps of each instance.
-    More detailed comments will be added later.
+    SequencePoolOp pools features of all time-steps of each instance.
+
+    For a mini-batch of 3 variable lengths sentences, containing 2, 3, and 2 time-steps:
+    
+    Assume X is a [7,M,N] float LoDTensor, and X->lod()[0] = [0, 2, 5, 7].
+    Besides, for the sake of simplicity, we assume M=1 and N=1, 
+    and the value of X = [[1, 3], [2, 4, 6], [5, 1]].
+
+    Thus, Out is a [3,1,1] float LoDTensor, but Out->lod() is nullptr.
+    And for different strategy, the value of Out is as follows: 
+
+    - AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
+    - SUM: [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
+    - SQRT: [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2), 
+           6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
+    - MAX: [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
+    - LAST: [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
+    - FIRST: [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
     )DOC");
   }
 };
 
-class SequenceAvgPoolGradOp : public framework::OperatorWithKernel {
+class SequencePoolGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -84,12 +108,10 @@ class SequenceAvgPoolGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sequence_avg_pool, ops::SequenceAvgPoolOp,
-            ops::SequenceAvgPoolOpMaker, sequence_avg_pool_grad,
-            ops::SequenceAvgPoolGradOp);
+REGISTER_OP(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker,
+            sequence_pool_grad, ops::SequencePoolGradOp);
 REGISTER_OP_CPU_KERNEL(
-    sequence_avg_pool,
-    ops::SequenceAvgPoolKernel<paddle::platform::CPUPlace, float>);
+    sequence_pool, ops::SequencePoolKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    sequence_avg_pool_grad,
-    ops::SequenceAvgPoolGradKernel<paddle::platform::CPUPlace, float>);
+    sequence_pool_grad,
+    ops::SequencePoolGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/sequence_pool_op.cu b/paddle/operators/sequence_pool_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..66850772d501f873cf754205c19e9d0c0090370a
--- /dev/null
+++ b/paddle/operators/sequence_pool_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/sequence_pool_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    sequence_pool, ops::SequencePoolKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    sequence_pool_grad,
+    ops::SequencePoolGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sequence_avg_pool_op.h b/paddle/operators/sequence_pool_op.h
similarity index 62%
rename from paddle/operators/sequence_avg_pool_op.h
rename to paddle/operators/sequence_pool_op.h
index ebe0956344eb71d0fb2836f1b4a989ac546d9f78..231614b4c1cb0eb1901b1720e933aed5cbb25f77 100644
--- a/paddle/operators/sequence_avg_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -28,54 +28,85 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
+enum SeqPoolType {
+  AVERAGE = 0,
+  SUM = 1,
+  SQRT = 2,  // square_root_n
+  MAX = 3,
+  LAST = 4,
+  FIRST = 5
+};
+
 template <typename Place, typename T>
-class SequenceAvgPoolKernel : public framework::OpKernel {
+class SequencePoolKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>("X");
     auto* out = context.Output<LoDTensor>("Out");
+    int strategy = context.Attr<int>("strategy");
 
     auto dims = in->dims();
-    auto lod = in->lod();
+    auto lod = in->lod()[0];
     int64_t w = in->numel() / dims[0];
 
     out->mutable_data<T>(context.GetPlace());
     auto place = context.GetEigenDevice<Place>();
-    for (int i = 0; i < static_cast<int>(lod[0].size()) - 1; ++i) {
-      Tensor in_t = in->Slice<T>(static_cast<int>(lod[0][i]),
-                                 static_cast<int>(lod[0][i + 1]));
+    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
+      Tensor in_t =
+          in->Slice<T>(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
       Tensor out_t = out->Slice<T>(i, i + 1);
-      int64_t h = static_cast<int64_t>(lod[0][i + 1] - lod[0][i]);
+      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
       auto in_e = EigenMatrix<T>::From(in_t, framework::make_ddim({h, w}));
       auto out_e = EigenVector<T>::Flatten(out_t);
-      out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
+
+      switch (strategy) {
+        case AVERAGE:
+          out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
+          break;
+        case SUM:
+          out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}}));
+          break;
+        default:
+          PADDLE_THROW("unsupported pooling strategy");
+      }
     }
   }
 };
 
 template <typename Place, typename T>
-class SequenceAvgPoolGradKernel : public framework::OpKernel {
+class SequencePoolGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>("X");
     auto* out_g = context.Input<LoDTensor>(framework::GradVarName("Out"));
     auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
+    int strategy = context.Attr<int>("strategy");
 
     auto dims = in->dims();
-    auto lod = in->lod();
+    auto lod = in->lod()[0];
     int64_t w = in->numel() / dims[0];
 
     in_g->mutable_data<T>(context.GetPlace());
     auto place = context.GetEigenDevice<Place>();
-    for (int i = 0; i < static_cast<int>(lod[0].size()) - 1; ++i) {
-      auto in_g_t = in_g->Slice<T>(static_cast<int>(lod[0][i]),
-                                   static_cast<int>(lod[0][i + 1]));
+    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
+      auto in_g_t = in_g->Slice<T>(static_cast<int>(lod[i]),
+                                   static_cast<int>(lod[i + 1]));
       auto out_g_t = out_g->Slice<T>(i, i + 1);
-      int64_t h = static_cast<int64_t>(lod[0][i + 1] - lod[0][i]);
+      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
       auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
       auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
       Eigen::DSizes<int, 2> bcast(h, 1);
-      in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
+
+      switch (strategy) {
+        case AVERAGE:
+          in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
+          break;
+        case SUM:
+          in_g_e.device(place) = (out_g_e).broadcast(bcast);
+          break;
+        default:
+          PADDLE_THROW("unsupported pooling strategy");
+      }
     }
   }
 };
diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index 1232e64c7f0132b9ea19b3d7e1ebe9531e1e25a5..b063e2427217f20eb89f7cd1af0354ad0e400feb 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -33,7 +33,7 @@ class SGDOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("param")->dims(),
                       ctx.Input<Tensor>("grad")->dims(),
                       "Two input of SGD Op's dimension must be same.");
-    ctx.Output<framework::LoDTensor>("param_out")
+    ctx.Output<framework::Tensor>("param_out")
         ->Resize(ctx.Input<Tensor>("param")->dims());
   }
 };
diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc
index 9ee6fff8db6a285a0314431e4e13b284c78c8a70..ae6d1c80b300690b070024d6266a1b99bf2ef04f 100644
--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
@@ -44,8 +44,8 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
                         "The shape of OutsideWeight must be same as X.");
     }
 
-    auto* diff = ctx.Output<framework::LoDTensor>("Diff");
-    auto* out = ctx.Output<framework::LoDTensor>("Out");
+    auto* diff = ctx.Output<framework::Tensor>("Diff");
+    auto* out = ctx.Output<framework::Tensor>("Out");
     diff->Resize(x->dims());
     // loss is a two-rank tensor
     out->Resize({x->dims()[0], 1});
@@ -103,10 +103,8 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
     auto in_dims = ctx.Input<framework::Tensor>("X")->dims();
     auto out_dims =
         ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->dims();
-    auto* x_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto* y_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
+    auto* x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* y_grad = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
 
     PADDLE_ENFORCE_GE(out_dims.size(), 2,
                       "The tensor rank of Input(Out@Grad) should be 2.");
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index c67eb028c882ed82ca4e6a4dd70cdea9f69cdc24..e15cfe485016552971924a40a172e74a90629dce 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -30,8 +30,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
 
     PADDLE_ENFORCE(ctx.Input<Tensor>("X")->dims().size() == 2UL,
                    "The input of softmax op must be a matrix.");
-    ctx.Output<framework::LoDTensor>("Y")->Resize(
-        ctx.Input<Tensor>("X")->dims());
+    ctx.Output<framework::Tensor>("Y")->Resize(ctx.Input<Tensor>("X")->dims());
   }
 };
 
@@ -77,7 +76,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
                       ctx.Input<Tensor>(framework::GradVarName("Y"))->dims(),
                       "Input(Y) and its gradients should have a same shape.");
 
-    ctx.Output<framework::LoDTensor>(framework::GradVarName("X"))
+    ctx.Output<framework::Tensor>(framework::GradVarName("X"))
         ->Resize(ctx.Input<Tensor>("X")->dims());
   }
 };
diff --git a/paddle/operators/split_op.cc b/paddle/operators/split_op.cc
index 61296f5c8122fdce7083e9a91dc313482875c805..a9d35b4fb79ae83379552ae2c2b4d694bd8f86dd 100644
--- a/paddle/operators/split_op.cc
+++ b/paddle/operators/split_op.cc
@@ -27,7 +27,7 @@ class SplitOp : public framework::OperatorWithKernel {
   void InferShape(const framework::InferShapeContext &ctx) const override {
     // infershape
     auto *in = ctx.Input<framework::Tensor>("X");
-    auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
+    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
     size_t axis = static_cast<size_t>(ctx.Attr<int>("axis"));
     size_t num = static_cast<size_t>(ctx.Attr<int>("num"));
     std::vector<int> sections =
diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc
index 39f4305877de20d451bc35fe698a0eabf9758d57..33a564b05b1b490c6d23b7d17cef45b7740dfa39 100644
--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/operators/squared_l2_distance_op.cc
@@ -54,9 +54,10 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
                    "First dimension of target must be equal to input "
                    "or to 1.");
 
-    ctx.Output<framework::LoDTensor>("sub_result")
+    ctx.Output<framework::Tensor>("sub_result")
         ->Resize({x_dims[0], x->numel() / x_dims[0]});
-    ctx.Output<framework::LoDTensor>("Out")->Resize({x_dims[0], 1});
+    ctx.Output<framework::Tensor>("Out")->Resize({x_dims[0], 1});
+    ctx.ShareLoD("X", /*->*/ "Out");
   }
 };
 
@@ -79,6 +80,9 @@ class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker {
     input or to 1. If the first dimension of target is 1, SquaredL2DistanceOp
     will broadcast target's first dimension to input's first dimension.
     You can decide whether calculate the gradient of input and target.
+
+    Both the input X and Y can carry the LoD (Level of Details) information,
+    or not. But the output only shares the LoD with input X.
     )DOC");
   }
 };
@@ -100,10 +104,8 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(out_dims[1], 1,
                       "Second dimension of output gradient "
                       "must be 1.");
-    auto* x_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto* y_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
+    auto* x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* y_grad = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
     if (x_grad) x_grad->Resize(x_dims);
     if (y_grad) y_grad->Resize(y_dims);
   }
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
index 41e05c27f9029b2664685d3979fadcfd2bf6dbce..437fc262f359525045a4d772ee2c204ef571caa7 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -28,7 +28,7 @@ class SumOp : public framework::OperatorWithKernel {
                             "Output(Out) of SumOp should not be null.");
 
     auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto *out = ctx.Output<framework::LoDTensor>("Out");
+    auto *out = ctx.Output<framework::Tensor>("Out");
     int N = ins.size();
 
     auto in_dim = ins[0]->dims();
@@ -39,6 +39,7 @@ class SumOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE(in_dim == dim, "Input tensors must have same shape");
     }
     out->Resize(in_dim);
+    ctx.ShareLoD("X", /*->*/ "Out");
   }
 };
 
@@ -49,8 +50,11 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "the input tensors of sum operator.").AsDuplicable();
     AddOutput("Out", "the output tensor of sum operator.");
     AddComment(R"DOC(
-            Sum the input tensors.
-        )DOC");
+Sum the input tensors.
+
+All the inputs can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD with the first input.
+)DOC");
   }
 };
 
@@ -61,7 +65,7 @@ class SumGradOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
     auto outputs =
-        ctx.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));
+        ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
     auto dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
     for (auto output : outputs) {
       output->Resize(dims);
diff --git a/paddle/operators/top_k_op.cc b/paddle/operators/top_k_op.cc
index 169b815feffd86f9ff04c129ccc997230ce03a8c..a6e43964e9825cd1ced9e7c1bc8d691422248fee 100644
--- a/paddle/operators/top_k_op.cc
+++ b/paddle/operators/top_k_op.cc
@@ -40,8 +40,8 @@ class TopkOp : public framework::OperatorWithKernel {
 
     framework::DDim dims = input->dims();
     dims[dims.size() - 1] = k;
-    ctx.Output<framework::LoDTensor>("Out")->Resize(dims);
-    ctx.Output<framework::LoDTensor>("Indices")->Resize(dims);
+    ctx.Output<framework::Tensor>("Out")->Resize(dims);
+    ctx.Output<framework::Tensor>("Indices")->Resize(dims);
   }
 };
 
diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc
index babf2f561c31d5436fe1611c576e6e7fc04401db..017a05326e9b397185d7c3530891884b11784783 100644
--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
@@ -51,7 +51,7 @@ class TransposeOp : public framework::OperatorWithKernel {
     for (size_t i = 0; i < axis_size; i++) {
       out_dims[i] = x_dims[axis[i]];
     }
-    ctx.Output<framework::LoDTensor>("Out")->Resize(out_dims);
+    ctx.Output<framework::Tensor>("Out")->Resize(out_dims);
   }
 };
 
@@ -99,8 +99,7 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
                             "Input(Out@GRAD) should not be null");
     auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto *x_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto *x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
 
     if (x_grad) x_grad->Resize(x_dims);
   }
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc
index 184bcbc29c0d26a214345506f126f9cc0d406b07..17ea48361bc597ccfeb80884d51900e6567aa057 100644
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -54,7 +54,7 @@ class UniformRandomOp : public framework::OperatorWithKernel {
 
     PADDLE_ENFORCE(Attr<float>("min") < Attr<float>("max"),
                    "uniform_random's min must less then max");
-    auto* tensor = ctx.Output<framework::LoDTensor>("Out");
+    auto* tensor = ctx.Output<framework::Tensor>("Out");
     auto dims = Attr<std::vector<int>>("dims");
     std::vector<int64_t> temp;
     temp.reserve(dims.size());
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index c7009a604f60cda11434ad33b6c7d7caee1befdd..fbe074188e5870de4b00fa4fff733035739974ea 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -34,12 +34,7 @@ limitations under the License. */
 namespace py = pybind11;
 
 namespace paddle {
-namespace framework {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using LoD = framework::LoD;
-
+namespace pybind {
 static size_t UniqueIntegerGenerator() {
   static std::atomic<size_t> generator;
   return generator.fetch_add(1);
@@ -56,6 +51,10 @@ bool IsCompileGPU() {
 PYBIND11_PLUGIN(core) {
   py::module m("core", "C++ core of PaddlePaddle");
 
+  // using framework in this function. Since it is inside a function, it will
+  // not cause namespace pollution.
+  using namespace paddle::framework;  // NOLINT
+
   py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
       .def_buffer(
           [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
@@ -107,7 +106,7 @@ PYBIND11_PLUGIN(core) {
 #ifdef PADDLE_ONLY_CPU
             new (&instance) LoDTensor(lod);
 #else
-             paddle::framework::LoD new_lod;
+             LoD new_lod;
              new_lod.reserve(lod.size());
              std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
              new (&instance) LoDTensor(new_lod);
@@ -118,7 +117,7 @@ PYBIND11_PLUGIN(core) {
 #ifdef PADDLE_ONLY_CPU
              self.set_lod(lod);
 #else
-             paddle::framework::LoD new_lod;
+             LoD new_lod;
              new_lod.reserve(lod.size());
              std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
              self.set_lod(new_lod);
@@ -132,7 +131,7 @@ PYBIND11_PLUGIN(core) {
            std::vector<std::vector<size_t>> new_lod;
            new_lod.reserve(lod.size());
            std::transform(lod.begin(), lod.end(), std::back_inserter(new_lod),
-               [](paddle::framework::Vector<size_t> item) ->
+               [](Vector<size_t> item) ->
                    std::vector<size_t> {
                  std::vector<size_t> v;
                  v.reserve(item.size());
@@ -317,5 +316,5 @@ All parameter, weight, gradient are variables in Paddle.
 
   return m.ptr();
 }
-}  // namespace framework
+}  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
index 95171acf729a513e5c92d1e0cba15cb12b38561a..bcfba84a1aa6e646cf255dc4612dfda42169fc44 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -23,7 +23,7 @@ namespace py = pybind11;
 
 namespace paddle {
 
-namespace framework {
+namespace pybind {
 
 namespace details {
 
diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/framework/op.py
index 6cca41e43b38b8cccb65ff9b347ef226dddecd4d..9086a5cc3452b178ec37fe6a3e358eaa4c5d606b 100644
--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/framework/op.py
@@ -89,12 +89,16 @@ class OpDescCreationMethod(object):
                     new_attr.f = user_defined_attr
                 elif attr.type == framework_pb2.STRING:
                     new_attr.s = user_defined_attr
+                elif attr.type == framework_pb2.BOOLEAN:
+                    new_attr.b = user_defined_attr
                 elif attr.type == framework_pb2.INTS:
                     new_attr.ints.extend(user_defined_attr)
                 elif attr.type == framework_pb2.FLOATS:
                     new_attr.floats.extend(user_defined_attr)
                 elif attr.type == framework_pb2.STRINGS:
                     new_attr.strings.extend(user_defined_attr)
+                elif attr.type == framework_pb2.BOOLEANS:
+                    new_attr.bools.extend(user_defined_attr)
                 elif attr.type == framework_pb2.INT_PAIRS:
                     for p in user_defined_attr:
                         pair = new_attr.int_pairs.add()
diff --git a/python/paddle/v2/framework/tests/test_cos_sim_op.py b/python/paddle/v2/framework/tests/test_cos_sim_op.py
index d314ce391ea2f10a8bd77c24e84fa3e1eebb6c73..47557ccb41d1e835b5d04d1b94f54dfc7aa2855a 100644
--- a/python/paddle/v2/framework/tests/test_cos_sim_op.py
+++ b/python/paddle/v2/framework/tests/test_cos_sim_op.py
@@ -24,15 +24,15 @@ class TestCosSimOp(OpTest):
         self.check_output()
 
     def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.06)
 
     def test_check_grad_ingore_x(self):
         self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set("X"))
+            ['Y'], 'Out', max_relative_error=0.06, no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
         self.check_grad(
-            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
+            ['X'], 'Out', max_relative_error=0.06, no_grad_set=set('Y'))
 
 
 class TestCosSimOp2(TestCosSimOp):
diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
index 0206ca064be87afe204aa99021979b7ddc3c5d63..f10db783225c07be9ffde25267fdfe096e97ecac 100644
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
@@ -19,7 +19,7 @@ class TestCrossEntropyOp1(OpTest):
             dtype="float32")
         self.inputs = {"X": X, "Label": label}
         self.outputs = {"Y": cross_entropy}
-        self.attrs = {'soft_label': 0}
+        self.attrs = {'soft_label': False}
 
     def test_check_output(self):
         self.check_output()
@@ -45,7 +45,7 @@ class TestCrossEntropyOp2(OpTest):
             axis=1, keepdims=True).astype("float32")
         self.inputs = {'X': X, 'Label': label}
         self.outputs = {'Y': cross_entropy}
-        self.attrs = {'soft_label': 1}
+        self.attrs = {'soft_label': True}
 
     def test_check_output(self):
         self.check_output()
@@ -76,7 +76,7 @@ class TestCrossEntropyOp3(OpTest):
             axis=1, keepdims=True).astype("float32")
         self.inputs = {'X': X, 'Label': label}
         self.outputs = {'Y': cross_entropy}
-        self.attrs = {'soft_label': 1}
+        self.attrs = {'soft_label': True}
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/v2/framework/tests/test_dropout_op.py b/python/paddle/v2/framework/tests/test_dropout_op.py
index 3638fee1a1c26195791bc1f5a46dd749da0aee95..29fc702791184aaacf335e13bcc6d03082bb49a6 100644
--- a/python/paddle/v2/framework/tests/test_dropout_op.py
+++ b/python/paddle/v2/framework/tests/test_dropout_op.py
@@ -7,7 +7,7 @@ class TestDropoutOp(OpTest):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.0, 'is_training': 1}
+        self.attrs = {'dropout_prob': 0.0, 'is_training': True}
         self.outputs = {'Out': self.inputs['X'], 'Mask': np.ones((32, 64))}
 
     def test_check_output(self):
@@ -21,7 +21,7 @@ class TestDropoutOp2(TestDropoutOp):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 1.0, 'is_training': 1}
+        self.attrs = {'dropout_prob': 1.0, 'is_training': True}
         self.outputs = {'Out': np.zeros((32, 64)), 'Mask': np.zeros((32, 64))}
 
 
@@ -29,7 +29,7 @@ class TestDropoutOp3(TestDropoutOp):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.0, 'is_training': 1}
+        self.attrs = {'dropout_prob': 0.0, 'is_training': True}
         self.outputs = {'Out': self.inputs['X'], 'Mask': np.ones((32, 64, 2))}
 
 
@@ -37,7 +37,7 @@ class TestDropoutOp4(OpTest):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.35, 'is_training': 0}
+        self.attrs = {'dropout_prob': 0.35, 'is_training': False}
         self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']}
 
     def test_check_output(self):
@@ -48,7 +48,7 @@ class TestDropoutOp5(OpTest):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.75, 'is_training': 0}
+        self.attrs = {'dropout_prob': 0.75, 'is_training': False}
         self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']}
 
     def test_check_output(self):
diff --git a/python/paddle/v2/framework/tests/test_elementwise_add_op.py b/python/paddle/v2/framework/tests/test_elementwise_add_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3101a709b8bcf58e8682ab3d0ca5217a7f3572d
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_elementwise_add_op.py
@@ -0,0 +1,96 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestElementwiseOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        }
+        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['Y'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
+
+
+class TestElementwiseAddOp_Vector(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.random((32, )).astype("float32"),
+            'Y': np.random.random((32, )).astype("float32")
+        }
+        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestElementwiseAddOp_broadcast_0(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(2).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(2, 1, 1)
+        }
+
+
+class TestElementwiseAddOp_broadcast_1(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(3).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 1)
+        }
+
+
+class TestElementwiseAddOp_broadcast_2(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(4).astype(np.float32)
+        }
+
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 1, 4)
+        }
+
+
+class TestElementwiseAddOp_broadcast_3(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
+            'Y': np.random.rand(3, 4).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 4, 1)
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_elementwise_div_op.py b/python/paddle/v2/framework/tests/test_elementwise_div_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..41cb2b7767eb8e01e46e770a5da21b609f4eb911
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_elementwise_div_op.py
@@ -0,0 +1,105 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class ElementwiseDivOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        """ Warning
+        CPU gradient check error!
+        'X': np.random.random((32,84)).astype("float32"),
+        'Y': np.random.random((32,84)).astype("float32")
+        """
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        }
+        self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
+
+
+class TestElementwiseDivOp_Vector(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [32]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [32]).astype("float32")
+        }
+        self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestElementwiseDivOp_broadcast_0(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [2]).astype("float32")
+        }
+
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out':
+            np.divide(self.inputs['X'], self.inputs['Y'].reshape(2, 1, 1))
+        }
+
+
+class TestElementwiseDivOp_broadcast_1(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [3]).astype("float32")
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out':
+            np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 1))
+        }
+
+
+class TestElementwiseDivOp_broadcast_2(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [4]).astype("float32")
+        }
+
+        self.outputs = {
+            'Out':
+            np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 4))
+        }
+
+
+class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [3, 4]).astype("float32")
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out':
+            np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 4, 1))
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
index e268cfddb26721a35ddd2d2cc18f526ff7b2f6d9..cee4385a8176f7a441a280e3cd40c39ca51493c5 100644
--- a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
@@ -3,14 +3,9 @@ import numpy as np
 from op_test import OpTest
 
 
-class TestElementwiseMulOp_Matrix(OpTest):
+class ElementwiseMulOp(OpTest):
     def setUp(self):
         self.op_type = "elementwise_mul"
-        """ Warning
-        CPU gradient check error!
-        'X': np.random.random((32,84)).astype("float32"),
-        'Y': np.random.random((32,84)).astype("float32")
-        """
         self.inputs = {
             'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
             'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
@@ -32,7 +27,7 @@ class TestElementwiseMulOp_Matrix(OpTest):
             ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y'))
 
 
-class TestElementwiseMulOp_Vector(OpTest):
+class TestElementwiseMulOp_Vector(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -41,22 +36,8 @@ class TestElementwiseMulOp_Vector(OpTest):
         }
         self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])}
 
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.1)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y'))
-
 
-class TestElementwiseMulOp_broadcast_0(OpTest):
+class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -69,22 +50,8 @@ class TestElementwiseMulOp_broadcast_0(OpTest):
             'Out': self.inputs['X'] * self.inputs['Y'].reshape(2, 1, 1)
         }
 
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.1)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y'))
-
 
-class TestElementwiseMulOp_broadcast_1(OpTest):
+class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -97,22 +64,8 @@ class TestElementwiseMulOp_broadcast_1(OpTest):
             'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 3, 1)
         }
 
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.1)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y'))
-
 
-class TestElementwiseMulOp_broadcast_2(OpTest):
+class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -124,22 +77,8 @@ class TestElementwiseMulOp_broadcast_2(OpTest):
             'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 1, 4)
         }
 
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.1)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y'))
-
 
-class TestElementwiseMulOp_broadcast_3(OpTest):
+class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
diff --git a/python/paddle/v2/framework/tests/test_elementwise_sub_op.py b/python/paddle/v2/framework/tests/test_elementwise_sub_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..be982e8c57b30b91c2834bd5db38ea3c89f573ee
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_elementwise_sub_op.py
@@ -0,0 +1,96 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestElementwiseOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
+
+
+class TestElementwiseSubOp_Vector(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.random((32, )).astype("float32"),
+            'Y': np.random.random((32, )).astype("float32")
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+
+
+class TestElementwiseSubOp_broadcast_0(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(2).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(2, 1, 1)
+        }
+
+
+class TestElementwiseSubOp_broadcast_1(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(3).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 3, 1)
+        }
+
+
+class TestElementwiseSubOp_broadcast_2(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(4).astype(np.float32)
+        }
+
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 1, 4)
+        }
+
+
+class TestElementwiseSubOp_broadcast_3(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
+            'Y': np.random.rand(3, 4).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 3, 4, 1)
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py b/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py
index 2473daaba24438819f3f55ccc40fe1c64ee59960..eff8fa87d9c0dafc6935604101e94ee6c8b081ce 100644
--- a/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py
+++ b/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py
@@ -6,8 +6,8 @@ from op_test import OpTest
 class TestFillZerosLikeOp(OpTest):
     def setUp(self):
         self.op_type = "fill_zeros_like"
-        self.inputs = {'Src': np.random.random((219, 232)).astype("float32")}
-        self.outputs = {'Dst': np.zeros_like(self.inputs["Src"])}
+        self.inputs = {'X': np.random.random((219, 232)).astype("float32")}
+        self.outputs = {'Y': np.zeros_like(self.inputs["X"])}
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/v2/framework/tests/test_multiplex_op.py b/python/paddle/v2/framework/tests/test_multiplex_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2b3881cde24c7fb96c3d7f9411352bc62d55077
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_multiplex_op.py
@@ -0,0 +1,43 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestMultiplexOp(OpTest):
+    def setUp(self):
+        self.op_type = "multiplex"
+        rows = 3
+        index = np.array([3, 1, 0])
+        ins1 = np.random.random((rows, 10)).astype("float32")
+        ins2 = np.random.random((rows, 10)).astype("float32")
+        ins3 = np.random.random((rows, 10)).astype("float32")
+        ins4 = np.random.random((rows, 10)).astype("float32")
+        self.inputs = {
+            'X': [('index', index), ('x1', ins1), ('x2', ins2), ('x3', ins3),
+                  ('x4', ins4)]
+        }
+        # multiplex output
+        output = np.zeros_like(ins1)
+        for i in range(0, rows):
+            k = index[i] + 1
+            output[i] = self.inputs['X'][k][1][i]
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['x1', 'x2', 'x3', 'x4'], 'Out')
+
+    def test_check_grad_ignore_x1(self):
+        self.check_grad(['x2', 'x3', 'x4'], 'Out', no_grad_set=set('x1'))
+
+    def test_check_grad_ignore_x1_x2(self):
+        self.check_grad(['x3', 'x4'], 'Out', no_grad_set=set(['x1', 'x2']))
+
+    def test_check_grad_ignore_x3(self):
+        self.check_grad(['x1', 'x2', 'x4'], 'Out', no_grad_set=set('x3'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py
index 79eda70021b76cd06e4c40740b1ca49476f4c503..cc3d4776e26a9dcaf9cf8403e0a1d0fca1d2ebae 100644
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -3,6 +3,7 @@ import paddle.v2.framework.core as core
 import unittest
 import numpy as np
 from paddle.v2.framework.op import Operator, RecurrentOp
+from op_test import get_numeric_gradient
 
 
 def py_sigmoid(x):
@@ -47,7 +48,7 @@ class PySimpleRNN(object):
         else:
             pre_mem = self.h_boot
         xW = np.matmul(x, self.W)
-        hU = np.matmul(mem, self.U)
+        hU = np.matmul(pre_mem, self.U)
 
         sum = xW + hU
         self.mems[step_id] = py_sigmoid(sum)
@@ -68,7 +69,7 @@ def create_tensor(scope, name, shape, np_data):
     return tensor
 
 
-class TestRecurrentOp(unittest.TestCase):
+class RecurrentOpTest(unittest.TestCase):
     '''
     Test RNNOp
 
@@ -158,6 +159,42 @@ class TestRecurrentOp(unittest.TestCase):
         print
         print 'py_output', py_output
         self.assertEqual(pd_output.shape, py_output.shape)
+        self.assertTrue(np.isclose(pd_output, py_output, rtol=0.1).all())
+
+
+class RecurrentGradientOpTest(unittest.TestCase):
+    def create_forward_op(self):
+        self.forward_op = RecurrentOp(
+            # inputs
+            inlinks=["x"],
+            boot_memories=["h_boot"],
+            step_net="stepnet",
+            # outputs
+            outlinks=["h"],
+            step_scopes="step_scopes",
+            # attributes
+            pre_memories=["h@pre"],
+            memories=["h@alias"])
+
+        # create a stepnet for RNN
+        stepnet = core.Net.create()
+        x_fc_op = Operator("mul", X="x@alias", Y="W", Out="Wx")
+        h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
+        sum_op = Operator("add", X="Wx", Y="Uh", Out="sum")
+        sig_op = Operator("sigmoid", X="sum", Y="h@alias")
+
+        for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
+            stepnet.append_op(op)
+        stepnet.complete_add_op(True)
+        self.forward_op.set_stepnet(stepnet)
+
+    def create_gradient_op(self):
+        a = set()
+        backward_op = core.RecurrentOp.backward(self.forward_op, a)
+
+    def test_grad(self):
+        self.create_forward_op()
+        self.create_gradient_op()
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/framework/tests/test_seq_pool.py b/python/paddle/v2/framework/tests/test_seq_pool.py
index cf864936af6361da1f16df3cfb759b468214b970..211086e5f4de32b996f0fa27c2eb52670c2b1e11 100644
--- a/python/paddle/v2/framework/tests/test_seq_pool.py
+++ b/python/paddle/v2/framework/tests/test_seq_pool.py
@@ -3,20 +3,37 @@ import numpy as np
 from op_test import OpTest
 
 
-class TestSeqAvgPool1D(OpTest):
-    def setUp(self):
-        self.op_type = 'sequence_avg_pool'
+class SeqPoolType(OpTest):
+    AVERAGE = 0
+    SUM = 1
+    SQRT = 2
+    MAX = 3
+    LAST = 4
+    FIRST = 5
+
+
+class TestSeqAvgPool(OpTest):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
         # one level, batch size is 4
         x = np.random.uniform(0.1, 1, [11, 23]).astype('float32')
         lod = [[0, 4, 5, 8, 11]]
+        self.inputs = {'X': (x, lod)}
 
         out = np.zeros((4, 23)).astype('float32')
+        self.outputs = {'Out': out}
+
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.AVERAGE}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
         for i in range(4):
             sub_x = x[lod[0][i]:lod[0][i + 1], :]
             out[i] = sub_x.mean(axis=0)
 
-        self.inputs = {'X': (x, lod)}
-        self.outputs = {'Out': out}
+    def setUp(self):
+        self.set_data()
+        self.compute()
 
     def test_check_output(self):
         self.check_output()
@@ -25,26 +42,44 @@ class TestSeqAvgPool1D(OpTest):
         self.check_grad(["X"], "Out")
 
 
-class TestSeqAvgPool2D(OpTest):
-    def setUp(self):
-        self.op_type = 'sequence_avg_pool'
+class TestSeqAvgPool2D(TestSeqAvgPool):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
         # one level, batch size is 4
         x = np.random.uniform(0.1, 1, [13, 3, 17]).astype('float32')
         lod = [[0, 4, 5, 8, 13]]
+        self.inputs = {'X': (x, lod)}
 
         out = np.zeros((4, 3, 17)).astype('float32')
+        self.outputs = {'Out': out}
+
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.AVERAGE}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
         for i in range(4):
             sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
             out[i] = np.reshape(sub_x.mean(axis=0), (3, 17))
 
-        self.inputs = {'X': (x, lod)}
-        self.outputs = {'Out': out}
 
-    def test_check_output(self):
-        self.check_output()
+class TestSeqSumPool(TestSeqAvgPool):
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.SUM}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            out[i] = sub_x.sum(axis=0)
 
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+
+class TestSeqSumPool2D(TestSeqAvgPool2D):
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.SUM}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
+        for i in range(4):
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+            out[i] = np.reshape(sub_x.sum(axis=0), (3, 17))
 
 
 if __name__ == '__main__':