diff --git a/.travis.yml b/.travis.yml
index a406841f6abf01f15826f34fe4c63b4c24486ccd..361136ac2c8d899a0d7a4d7945083fcc489551b5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -27,15 +27,6 @@ script:
# 43min timeout
paddle/scripts/paddle_docker_build.sh ${JOB}
if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
- - |
- if [[ "$JOB" != "doc" ]]; then exit 0; fi;
- # For document only
- if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
- if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
- export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
- export DOCS_DIR=`pwd`
- cd ..
- curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
notifications:
email:
on_success: change
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2cab76e8f01c571c931398f6492aa9aeeebf1f08..f56c5d382af8cdfb5a941ee272a0f8d22ec04d67 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,6 +65,7 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d
option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
+option(WITH_INFERENCE "Compile fluid inference library" ON)
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
@@ -159,6 +160,7 @@ endif()
########################################################################################
include(external/mklml) # download mklml package
+include(external/xbyak) # download xbyak package
include(external/libxsmm) # download, build, install libxsmm
include(external/zlib) # download, build, install zlib
include(external/gflags) # download, build, install gflags
@@ -175,6 +177,7 @@ include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/cares)
+include(external/cub)
if(WITH_DISTRIBUTE)
if(WITH_GRPC)
diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..c94849cf4b96746e6c507db2a6310c2f305dacf5
--- /dev/null
+++ b/cmake/external/cub.cmake
@@ -0,0 +1,35 @@
+if(NOT WITH_GPU)
+ return()
+endif()
+
+include(ExternalProject)
+
+set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub)
+set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub)
+
+include_directories(${CUB_INCLUDE_DIR})
+
+ExternalProject_Add(
+ extern_cub
+ ${EXTERNAL_PROJECT_LOG_ARGS}
+ GIT_REPOSITORY "https://github.com/NVlabs/cub.git"
+ GIT_TAG "v1.8.0"
+ PREFIX ${CUB_SOURCE_DIR}
+ UPDATE_COMMAND ""
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ INSTALL_COMMAND ""
+ TEST_COMMAND ""
+)
+
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
+ set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c)
+ file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+ add_library(cub STATIC ${dummyfile})
+else()
+ add_library(cub INTERFACE)
+endif()
+
+add_dependencies(cub extern_cub)
+
+list(APPEND external_project_dependencies cub)
diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..384c2f9328296ce6a8a6293be6cc47e5063dd3c4
--- /dev/null
+++ b/cmake/external/xbyak.cmake
@@ -0,0 +1,58 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set(WITH_XBYAK ON)
+if(WIN32 OR APPLE)
+ SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE)
+ return()
+endif()
+
+include(ExternalProject)
+
+set(XBYAK_PROJECT extern_xbyak)
+set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak)
+set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak)
+set(XBYAK_INC_DIR ${XBYAK_INSTALL_ROOT}/include)
+
+include_directories(${XBYAK_INC_DIR})
+include_directories(${XBYAK_INC_DIR}/xbyak)
+
+add_definitions(-DPADDLE_WITH_XBYAK)
+
+# xbyak options
+add_definitions(-DXBYAK64)
+add_definitions(-DXBYAK_NO_OP_NAMES)
+
+ExternalProject_Add(
+ ${XBYAK_PROJECT}
+ ${EXTERNAL_PROJECT_LOG_ARGS}
+ DEPENDS ""
+ GIT_REPOSITORY "https://github.com/herumi/xbyak.git"
+ GIT_TAG "v5.661" # Jul 26th
+ PREFIX ${XBYAK_PREFIX_DIR}
+ UPDATE_COMMAND ""
+ CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
+ CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
+)
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+ set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/xbyak_dummy.c)
+ file(WRITE ${dummyfile} "const char *dummy_xbyak = \"${dummyfile}\";")
+ add_library(xbyak STATIC ${dummyfile})
+else()
+ add_library(xbyak INTERFACE)
+endif()
+
+add_dependencies(xbyak ${XBYAK_PROJECT})
+list(APPEND external_project_dependencies xbyak)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 07bab994d354df834d0667c69f307b2d7684fb22..82c958073cba92f00a341121e36ba45531b22aec 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -264,7 +264,10 @@ function(cc_test TARGET_NAME)
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if (${cc_test_SERIAL})
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+
+ set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+ set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endif()
endfunction(cc_test)
@@ -329,7 +332,10 @@ function(nv_test TARGET_NAME)
add_test(${TARGET_NAME} ${TARGET_NAME})
if (nv_test_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+
+ set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+ set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endif()
endfunction(nv_test)
@@ -577,7 +583,9 @@ function(py_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
- COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
+ COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
+ FLAGS_cpu_deterministic=true
+ PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md
index 5d061e1c00d2ca0194153730a39486b8357fa5b0..faf39f276dbddcd4961407ba2d082c9826051cbe 100644
--- a/doc/fluid/howto/optimization/timeline_cn.md
+++ b/doc/fluid/howto/optimization/timeline_cn.md
@@ -1,21 +1,27 @@
# How to use the timeline tool for performance profiling
-1. Add `with profiler.profiler(...)` around the main training loop. After running, the code generates a profile record file in the `/tmp/profile` directory.
+1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` around the main training loop. After running, the code generates a profile record file in the `/tmp/profile` directory.
	**Tip:**
	Please do not run too many iterations while recording timeline information, because the number of records in the timeline is proportional to the number of iterations.
```python
- with profiler.profiler('All', 'total', '/tmp/profile') as prof:
- for pass_id in range(pass_num):
- for batch_id, data in enumerate(train_reader()):
- exe.run(fluid.default_main_program(),
- feed=feeder.feed(data),
- fetch_list=[])
+ for pass_id in range(pass_num):
+ for batch_id, data in enumerate(train_reader()):
+ if pass_id == 0 and batch_id == 5:
+ profiler.start_profiler("All")
+ elif pass_id == 0 and batch_id == 10:
+ profiler.stop_profiler("total", "/tmp/profile")
+ exe.run(fluid.default_main_program(),
+ feed=feeder.feed(data),
+ fetch_list=[])
...
```
1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`. By default this generates a `/tmp/timeline` file; you can change the path with a command-line argument, see [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py).
+```bash
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
1. Open the Chrome browser, visit `chrome://tracing/`, and use the `load` button to load the generated `timeline` file.
diff --git a/doc/fluid/howto/optimization/timeline_en.md b/doc/fluid/howto/optimization/timeline_en.md
index 96481ae2a6e4442d40803f8d5361e5f942502df3..6f963c6b4da6967fb2f493ada917a4b08917fa4c 100644
--- a/doc/fluid/howto/optimization/timeline_en.md
+++ b/doc/fluid/howto/optimization/timeline_en.md
@@ -1,15 +1,17 @@
# How to use the timeline tool to profile
-1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
+1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` to the main training loop. After running, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, because the profile record grows with the number of batches.
```python
- with profiler.profiler('All', 'total', '/tmp/profile') as prof:
- for pass_id in range(pass_num):
- for batch_id, data in enumerate(train_reader()):
- exe.run(fluid.default_main_program(),
- feed=feeder.feed(data),
- fetch_list=[],
- use_program_cache=True)
+ for pass_id in range(pass_num):
+ for batch_id, data in enumerate(train_reader()):
+ if pass_id == 0 and batch_id == 5:
+ profiler.start_profiler("All")
+ elif pass_id == 0 and batch_id == 10:
+ profiler.stop_profiler("total", "/tmp/profile")
+ exe.run(fluid.default_main_program(),
+ feed=feeder.feed(data),
+ fetch_list=[])
...
```
@@ -17,6 +19,10 @@
file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at
[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
+```bash
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
1. Open Chrome, visit `chrome://tracing/`, and use the `load` button to load the generated `timeline` file.
![chrome tracing](./tracing.jpeg)
diff --git a/doc/survey/op_fusion_design.md b/doc/survey/op_fusion_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..d6e48f4f58269b67450cb012f6dcc59e1083abba
--- /dev/null
+++ b/doc/survey/op_fusion_design.md
@@ -0,0 +1,20 @@
+# Operator fusion
+Fusing multiple operators together is an important way to optimize program execution, particularly on GPUs and other specialized accelerators. An obvious benefit is avoiding the overhead of writing intermediate results back to global memory.
+
+There are generally two ways to fuse operators: fusing directly connected operators, and fusing operators that are not directly connected. The first method is mainly used by the [NNVM Compiler](https://github.com/dmlc/tvm/) and [XLA](https://www.tensorflow.org/performance/xla/). The second method is mainly used by DyNet and TensorFlow Fold to do auto-batching. The principle of operator fusion is to combine multiple operations into one according to some rules; for example, `Y = X * W` and `Z = Y + B` can be fused into `Z = X * W + B`, and `Y1 = X1 * W` and `Y2 = X2 * W` can be fused into `[Y1;Y2] = [X1;X2] * W`. In order to get a short-term benefit, we decided to start by specifying these rules manually.
+
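+To make the first kind of rule concrete, here is a minimal sketch (illustrative C++ only, not Paddle code; the row-major layout, shapes, and function name are assumptions) that computes `Z = X * W + B` in a single pass instead of materializing the intermediate `Y = X * W` and reading it back:
+
+```cpp
+#include <cstddef>
+#include <vector>
+
+// Fused form of Y = X * W followed by Z = Y + B (row-major):
+// X is m x k, W is k x n, B has n elements, Z is m x n.
+// The accumulator stays in a register, so the intermediate Y is never
+// written to, or read back from, global memory.
+void FusedMulAdd(const std::vector<float> &X, const std::vector<float> &W,
+                 const std::vector<float> &B, std::vector<float> *Z,
+                 size_t m, size_t k, size_t n) {
+  Z->resize(m * n);
+  for (size_t i = 0; i < m; ++i) {
+    for (size_t j = 0; j < n; ++j) {
+      float acc = B[j];  // start from the bias instead of doing a second pass
+      for (size_t p = 0; p < k; ++p) {
+        acc += X[i * k + p] * W[p * n + j];
+      }
+      (*Z)[i * n + j] = acc;
+    }
+  }
+}
+```
+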
+## Challenge
+The main challenges of fusing operators are:
+ - how to define the rules;
+ - how to implement these rules efficiently.
+
+### How to make the rules?
+
+Determining the best placement of a fused operator is an NP-hard combinatorial problem. After analyzing the operators of typical DL models, we found two groups of operators that can be fused explicitly: one is simple, adjacent operations, for example `tmp = x + y` followed by `z = Relu(tmp)`; the other is operators that perform the same function, for example a series of `SGD` or `Momentum` operators. Both groups usually appear in a model in large numbers, so we should first think about how to fuse each of them separately.
+
+### How to implement these rules efficiently?
+#### How to fuse the adjacent operations efficiently?
+Here we use a template function to represent the fused operations. The pros of using a template function are that it is simple and efficient; the cons are that it is not easy to extend and can only express some simple operations. Taking our current needs into account, a template function is the more appropriate choice.
+
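+The sketch below illustrates this idea under the assumption of plain C-style buffers and trivially simple functors; it is illustrative only, not the actual Paddle functor interface:
+
+```cpp
+#include <algorithm>
+#include <cstddef>
+
+struct AddFunctor {
+  float operator()(float x, float y) const { return x + y; }
+};
+struct ReluFunctor {
+  float operator()(float x) const { return std::max(x, 0.0f); }
+};
+
+// One template function expresses the fused pattern z = Unary(Binary(x, y)),
+// e.g. tmp = x + y; z = Relu(tmp), without storing the intermediate tmp.
+template <typename BinaryFunctor, typename UnaryFunctor>
+void FusedElemwiseCompute(const float *x, const float *y, float *z, size_t n,
+                          BinaryFunctor binary, UnaryFunctor unary) {
+  for (size_t i = 0; i < n; ++i) {
+    z[i] = unary(binary(x[i], y[i]));
+  }
+}
+
+// Usage: FusedElemwiseCompute(x, y, z, n, AddFunctor(), ReluFunctor());
+```
+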
+#### How to fuse the operators that have the same function efficiently?
+Take the `SGD` operator as an example: a training model may have hundreds of parameters and, correspondingly, the same number of `SGD` operators. The expression of all these operators (`w = w - lr * w_g`) is identical, so during training the executor runs this expression hundreds of times on the CPU or another specialized accelerator. If we can fuse them and make the addresses of all the `w` and all the `w_g` contiguous, we only need to execute it once. On some accelerators the kernel-launch time is not negligible, so launching and executing a kernel hundreds of times can cost far more than launching and executing it only once. There are usually many operators similar to `SGD` in a DL model, such as `AllReduce` and `FC`.
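+
+As an illustration, the sketch below assumes all the parameters and their gradients have already been packed into two contiguous buffers (the packing itself and the function name are hypothetical); the hundreds of per-parameter updates then collapse into one pass, i.e. one kernel launch on an accelerator:
+
+```cpp
+#include <cstddef>
+
+// One fused SGD update over every parameter element at once, instead of
+// launching one small `w = w - lr * w_g` kernel per parameter tensor.
+void FusedSGD(float *w, const float *w_g, float lr, size_t total_numel) {
+  for (size_t i = 0; i < total_numel; ++i) {
+    w[i] -= lr * w_g[i];
+  }
+}
+```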
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 3ef317bb7a1c25c5738342f34ae7994b0184a7de..dd172ff9c97814c089ddb2e5bf729880cf0c9cdb 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -336,6 +336,7 @@ paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=Non
paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index d274d96c29bdbf5973d568d783369c3975bdc436..2577e59d9cf24c26b7c04aa00cdde6cde17f7206 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -5,5 +5,7 @@ add_subdirectory(operators)
add_subdirectory(pybind)
add_subdirectory(string)
add_subdirectory(recordio)
-# NOTE: please add subdirectory inference at last.
-add_subdirectory(inference)
+if(WITH_INFERENCE)
+ # NOTE: please add subdirectory inference at last.
+ add_subdirectory(inference)
+endif()
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 700c73c745bad72637d77385f5cd38c494501c86..bf493a3fa44e48deec734250d04b2a413c3ed9da 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -17,6 +17,7 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
@@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif
void AllReduceOpHandle::RunImpl() {
+ platform::RecordEvent r("all_reduce", nullptr);
if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1;
} else {
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index b2e5399e2376a86c1cd310b29c768832665af87f..8714a42162bda3d5ad12e7925fe8cc4e693f51b1 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -21,6 +21,26 @@ namespace framework {
namespace details {
struct BuildStrategy {
+  // ParallelExecutor supports two ReduceStrategy modes, kAllReduce and
+  // kReduce, for both CPU and GPU. With kAllReduce, every thread optimizes
+  // all of the parameters separately. With kReduce, the optimization of the
+  // parameters is distributed across the threads.
+  // For example, if a model has 100 parameters and runs with four threads,
+  // then with kAllReduce every thread optimizes all 100 parameters
+  // separately, while with kReduce every thread optimizes only 25
+  // parameters.
+  // Of particular note: with kReduce during CPU training, all the
+  // parameters are shared between the threads, which saves memory.
+  // FIXME(zcd): The results of the two modes (kAllReduce and kReduce) may
+  // not be equal on GPU, because summing in a different order can produce a
+  // different result; for example, the result of `a+b+c+d` may differ from
+  // the result of `c+a+b+d`.
+  // On GPU, both kAllReduce and kReduce are implemented with NCCL, so their
+  // results may not be equal.
+  // On CPU, if you want to fix the summing order so that kAllReduce and
+  // kReduce produce identical results, set `FLAGS_cpu_deterministic=true`
+  // in the environment.
enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 };
enum class GradientScaleStrategy {
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 5ca2ed8f96244a11925dfa6af8e48458cf334ecd..a4fdbcb26d1d0cfb05edebff5419d9559c336b3a 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -275,7 +275,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl(
if (strategy_.gradient_scale_ !=
BuildStrategy::GradientScaleStrategy::kCustomized) {
// TODO(paddle-dev): Why is there no input for this op_handle?
- CreateScaleLossGradOp(&result);
+ auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
+ CreateScaleLossGradOp(&result, loss_grad_name);
}
// This assumes the backward generating code will ensure IsScaleLossOp
// is true only for the op that scale the final scalar loss.
@@ -535,7 +536,8 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph,
return got == sharded_var_device.end() ? -1 : got->second;
}
-void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const {
+void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
+ ir::Graph *result, const std::string &loss_grad_name) const {
for (size_t i = 0; i < places_.size(); ++i) {
// Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA
@@ -558,10 +560,10 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const {
// loss->pending_ops_.emplace_back(op_handle);
// op_handle->inputs_.emplace_back(loss);
- CreateOpOutput(result, op_handle,
- result->CreateEmptyNode(GradVarName(loss_var_name_),
- ir::Node::Type::kVariable),
- places_[i], i);
+ CreateOpOutput(
+ result, op_handle,
+ result->CreateEmptyNode(loss_grad_name, ir::Node::Type::kVariable),
+ places_[i], i);
}
}
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index 099dbe5abef6458c4613c9f680440734f59cb6e2..f2cb6bb1c861e07f1034f1742ad4f3cfbb0d8837 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -75,7 +75,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
void CreateComputationalOps(ir::Graph *result, ir::Node *node,
size_t num_places) const;
- void CreateScaleLossGradOp(ir::Graph *result) const;
+ void CreateScaleLossGradOp(ir::Graph *result,
+ const std::string &loss_grad_name) const;
+
VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
int dst_dev_id) const;
void CreateComputationalOp(ir::Graph *result, ir::Node *node,
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 7160e346dad0615e2fd32b70c096880af0359e1a..6c7e5c1fb06620b1c071b00fcfcc1b4a29bf8d62 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -16,12 +16,18 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_bool(
+ cpu_deterministic, false,
+ "Whether to make the result of computation deterministic in CPU side.");
namespace paddle {
namespace framework {
namespace details {
void ReduceOpHandle::RunImpl() {
+ platform::RecordEvent r("reduce", nullptr);
if (places_.size() == 1) return;
// the input and output may have dummy var.
auto in_var_handles = DynamicCast(inputs_);
@@ -89,11 +95,33 @@ void ReduceOpHandle::RunImpl() {
} else {
std::vector lod_tensors =
GetInputValues(in_var_handles, var_scopes);
+
if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
this->RunAndRecordEvent([&] {
- ReduceLoDTensor func(lod_tensors,
- out_var->GetMutable());
- VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+          // FIXME(zcd): The order of summing is important,
+          // especially when the type of the data is float or double.
+          // For example, the result of `a+b+c+d` may differ from the
+          // result of `c+a+b+d`, so the summing order should be fixed.
+ if (!FLAGS_cpu_deterministic) {
+            ReduceLoDTensor func(lod_tensors,
+                                 out_var->GetMutable<framework::LoDTensor>());
+ VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+ } else {
+ // We sum lod_tensors to reduce_sum_trg which is in local_scopes_0
+ // here, but it doesn't mean reduce_sum_trg must be in local_scopes_0.
+          auto &reduce_sum_trg = *this->local_scopes_[0]
+                                      ->FindVar(kLocalExecScopeName)
+                                      ->Get<Scope *>()
+                                      ->FindVar(out_var_handle->name_)
+                                      ->GetMutable<framework::LoDTensor>();
+ ReduceLoDTensor func(lod_tensors, &reduce_sum_trg);
+ VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+
+          auto trg = out_var->GetMutable<framework::LoDTensor>();
+          if (reduce_sum_trg.data<void>() != trg->data<void>()) {
+ TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg);
+ }
+ }
});
} else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
#ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index 1d80bab90f513139f807b57258177c6b2ac53ac0..5bd974d6b789a2f085c0a69de5e133187342f587 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -17,6 +17,7 @@
#include
#include
#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
@@ -62,6 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
eptr = std::current_exception();
}
+ platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
drop_scope_counter_ += 1;
if (!fetch_tensors.empty() ||
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index e556c84b0219eba2b92c456c205e03947171626b..0eaf9a9c951991a5775604eb8d0e7535f81a4ae2 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
@@ -34,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector &fetch_tensors) {
+  std::unique_ptr<platform::RecordEvent> event(
+ new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
std::unordered_map pending_ops;
std::unordered_set pending_vars;
BlockingQueue ready_vars;
@@ -84,6 +87,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// Clean run context
run_op_futures_.clear();
exception_holder_.Clear();
+ event.reset(nullptr);
// Step 3. Execution
while (!pending_vars.empty()) {
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index c2800c972a5501859672fbfd6921499e84d09cb0..dad170ed78c64202b5c812bd8682887fe3b736d6 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -330,12 +330,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
}
for (auto& op : ctx->ops_) {
- VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_);
- // NOTE! Please do not delete this line, it's usefull because the debug
- // string before and after op.run are different, after run the output
- // will have right shape which is usefull for debug.
- VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index f870fb2b9cf805aba84d6f4573b0574ff361e71c..f87d5212c0cd87a5a63cf2d54ca677516ab45816 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -182,9 +182,11 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
}
/**
- * We only handle write after read(WAR), since it should not have a write
- * after write in program. If there are write after write operators, we need
- * prune them.
+ * We should handle both write after read (WAR) and write after write (WAW)
+ * here, because some operators of the program can be executed in parallel.
+ * So, to make the program run in the right order, we should add the WAR and
+ * WAW dependences.
+ *
*
* https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
*/
@@ -201,6 +203,19 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
(*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
const auto &read_ops = (*it_old)->outputs;
+ PADDLE_ENFORCE(write_op, "The write_op should not be empty.");
+
+ // Add write after write dependence
+ ir::Node *upstream_op =
+ (*it_old)->inputs.empty() ? nullptr : (*it_old)->inputs[0];
+ if (upstream_op) {
+ ir::Node *dep_var = CreateControlDepVar();
+ write_op->inputs.push_back(dep_var);
+ upstream_op->outputs.push_back(dep_var);
+ dep_var->outputs.push_back(write_op);
+ dep_var->inputs.push_back(upstream_op);
+ }
+
for (auto *read_op : read_ops) {
// Manually add a dependency var from read_op to write_op;
if (read_op == write_op) {
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index cdac00739bc48648b41751e644a953d0d310ffbf..d04f7744961b2561977f4d36d0073a97557043da 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -127,7 +127,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
}
void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
- VLOG(10) << "- " << DebugStringEx(&scope);
+ VLOG(4) << place << " " << DebugStringEx(&scope);
if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot run operator on place %s", place);
@@ -136,8 +136,10 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform::SetDeviceId(dev_id);
#endif
}
+ platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+ platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
- VLOG(10) << "+ " << DebugStringEx(&scope);
+ VLOG(3) << place << " " << DebugStringEx(&scope);
}
bool OperatorBase::HasInputs(const std::string& name) const {
@@ -639,9 +641,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
- // For profiling, don't move out of this function because that will result
- // in the failure of multi-GPU profiling.
- platform::RecordEvent record_event(Type(), dev_ctx);
// check if op[type] has kernel registered.
auto& all_op_kernels = AllOpKernels();
auto kernels_iter = all_op_kernels.find(type_);
@@ -779,6 +778,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const {
auto& scope = ctx.scope();
int data_type = -1;
+ std::string last_input_name;
for (auto& input : this->inputs_) {
for (auto& ipt_name : input.second) {
auto* var = scope.FindVar(ipt_name);
@@ -795,9 +795,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
int tmp = static_cast(ToDataType(t->type()));
PADDLE_ENFORCE(
tmp == data_type || data_type == -1,
- "DataType of Paddle Op %s must be the same. Get %d != %d", Type(),
- data_type, tmp);
+ "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)",
+ Type(), last_input_name, data_type, ipt_name, tmp);
data_type = tmp;
+ last_input_name = ipt_name;
}
}
}
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index 98bdfcc00b9f0e8f40dfc92e4021b2bd6fb19313..c4ab26a2288bb9d8f3cd54a797d2062e0606b219 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -24,7 +24,7 @@
namespace paddle {
-DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
+DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, true,
"Enable subgraph to TensorRT engine for acceleration");
DEFINE_string(inference_analysis_graphviz_log_root, "./",
@@ -42,10 +42,19 @@ class DfgPassManagerImpl final : public DfgPassManager {
// TODO(Superjomn) set the key with pass reprs.
AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
- auto trt_teller = [](const Node* node) {
+ auto trt_teller = [&](const Node* node) {
+        std::unordered_set<std::string> teller_set(
+ {"elementwise_add", "mul", "conv2d", "pool2d", "relu"});
if (!node->IsFunction()) return false;
- return static_cast(node)->func_type() == "mul";
+
+        const auto* func = static_cast<const Function*>(node);
+ if (teller_set.count(func->func_type()))
+ return true;
+ else {
+ return false;
+ }
};
+
AddPass("tensorrt-subgraph-marker",
new TensorRTSubgraphNodeMarkPass(trt_teller));
AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc
index 8a3af0a8ebd5bad7be7046fa399cca4920da3d71..7f64bc75ae8ad40a268739cdc36051e76af9f49a 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -337,6 +337,34 @@ ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT
std::vector(outputs.begin(), outputs.end()));
}
+void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
+ std::vector op_nodes;
+  for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
+ if (node.type() == Node::Type::kValue || node.deleted()) {
+ continue;
+ }
+ op_nodes.push_back(&node);
+ }
+ size_t op_num = op_nodes.size();
+ for (size_t i = 0; i < op_num; i++) {
+ if (op_nodes[i]->type() == Node::Type::kFunction) continue;
+    std::unordered_set<std::string> follow_up_input_names;
+ for (size_t j = i + 1; j < op_num; j++) {
+ for (auto *in : op_nodes[j]->inlinks) {
+ follow_up_input_names.insert(in->name());
+ }
+ }
+    std::vector<Node *> filtered_subgraph_outlinks;
+ for (auto *out : op_nodes[i]->outlinks) {
+ if (follow_up_input_names.count(out->name())) {
+ filtered_subgraph_outlinks.push_back(out);
+ }
+ }
+ PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL);
+ op_nodes[i]->outlinks = filtered_subgraph_outlinks;
+ }
+}
+
} // namespace analysis
} // namespace inference
} // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
index 16aeae4d35e7bd54646053190da7f47eaca69aa0..bb3ec6bbc1d9555386aba8837b019d2511653258 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -178,6 +178,7 @@ struct GraphTraits {
std::pair, std::vector>
ExtractInputAndOutputOfSubGraph(std::vector &graph); // NOLINT
+void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph);
} // namespace analysis
} // namespace inference
} // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
index 2328d870422c5a31c22d7b09980aae35e01b2b25..18c32fa09199003f17183207828cdfe4e627ae1a 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -23,7 +23,7 @@
namespace paddle {
namespace inference {
-DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size");
+DEFINE_int32(tensorrt_max_batchsize, 3, "TensorRT maximum batch size");
DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
namespace analysis {
@@ -52,6 +52,7 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
bool DataFlowGraphToFluidPass::Finalize() { return true; }
void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
+ FilterRedundantOutputOfSubGraph(graph);
LOG(INFO) << "graph.inputs " << graph->inputs.size();
for (auto &node : GraphTraits(graph).nodes_in_TS()) {
if (node.deleted()) continue;
@@ -87,34 +88,113 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
}
void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
- const framework::proto::BlockDesc &block) {
+ framework::proto::BlockDesc *block) {
static int counter{0};
PADDLE_ENFORCE(node->IsFunctionBlock());
framework::OpDesc desc;
auto *func = static_cast(node);
// collect inputs
- std::vector io;
+  std::unordered_set<std::string> input_names;
for (auto *x : func->inlinks) {
- io.push_back(x->name());
+ input_names.insert(x->name());
}
- desc.SetInput("Xs", io);
+  desc.SetInput(
+      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
- // collect outputs
- io.clear();
+  std::unordered_set<std::string> output_names;
for (auto *x : func->outlinks) {
- io.push_back(x->name());
+ output_names.insert(x->name());
}
- desc.SetOutput("Ys", io);
+
+  std::vector<std::string> output_temp(output_names.begin(),
+                                       output_names.end());
+ desc.SetOutput("Ys", output_temp);
desc.SetType("tensorrt_engine");
- PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc");
+  std::unordered_map<std::string, std::string> output_name_map;
+
+  // The following procedure renames all the intermediate variables and the
+  // output variables of the subgraph.
+  // Why do we do this?
+  // During the transition from a fluid OP to a tensorrt OP, we map
+  // the input and output Tensor (fluid data structure) of the fluid OP
+  // to the corresponding ITensor (trt data structure) through the
+  // Tensor name. When we set up an ITensor for a variable, we must
+  // ensure that it has not been set before.
+  // If a variable in the fluid graph is not only the input of one OP
+  // but also the output of another OP, there will be problems.
+  // So we have to rename the variables in the subgraph to make sure
+  // each one is either an OP's input or an OP's output.
+
+ auto subgraph_nodes = func->subgraph;
+ for (int index = 0; index < block->ops_size(); index++) {
+ framework::proto::OpDesc *op = block->mutable_ops(index);
+ auto correspond_node = subgraph_nodes[index];
+ PADDLE_ENFORCE_EQ(correspond_node->name(), op->type());
+
+    std::unordered_map<std::string, int> var2id;
+ for (auto *in_var : correspond_node->inlinks) {
+ var2id[in_var->name()] = in_var->id();
+ }
+ // rename for the input variables of op inside subgraph
+ for (int i = 0; i < op->inputs_size(); i++) {
+ framework::proto::OpDesc_Var *in_var = op->mutable_inputs(i);
+      std::vector<std::string> replaced_names;
+ for (int k = 0; k < in_var->arguments_size(); k++) {
+ std::string arg_value = in_var->arguments(k);
+ if (input_names.count(arg_value)) {
+ replaced_names.push_back(arg_value);
+ } else {
+ replaced_names.push_back(arg_value +
+ std::to_string(var2id[arg_value]));
+ }
+ }
+ in_var->clear_arguments();
+ for (size_t k = 0; k < replaced_names.size(); k++) {
+ in_var->add_arguments(replaced_names[k]);
+ }
+ }
+ var2id.clear();
+ for (auto out_var : correspond_node->outlinks) {
+ var2id[out_var->name()] = out_var->id();
+ }
+
+ // rename for the output variables of op inside subgraph
+ for (int i = 0; i < op->outputs_size(); i++) {
+ framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
+      std::vector<std::string> replaced_names;
+ for (int k = 0; k < out_var->arguments_size(); k++) {
+ std::string arg_value = out_var->arguments(k);
+ if (output_names.count(arg_value)) {
+ output_name_map[arg_value] =
+ arg_value + std::to_string(var2id[arg_value]);
+ }
+ replaced_names.push_back(arg_value + std::to_string(var2id[arg_value]));
+ }
+ out_var->clear_arguments();
+ for (size_t k = 0; k < replaced_names.size(); k++) {
+ out_var->add_arguments(replaced_names[k]);
+ }
+ }
+ }
+  // When the tensorrt engine runs at the end of the operation,
+  // output_mapping helps us copy the data from the renamed ITensor
+  // to the output Tensor.
+  std::vector<std::string> output_mapping;
+ for (auto name : output_names) {
+ PADDLE_ENFORCE(output_name_map.count(name) != 0);
+ output_mapping.push_back(output_name_map[name]);
+ }
+
+ PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
// Set attrs
- SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
+ SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
+ SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
node->SetPbMsg(desc.Proto()->SerializeAsString());
}
@@ -146,15 +226,17 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
LOG(INFO) << "transformed variable size: "
<< block_desc.Proto()->vars().size();
// copy ops.
+
for (auto *node : block_node->subgraph) {
auto *op = block_desc.AppendOp();
PADDLE_ENFORCE(!node->pb_msg().empty());
op->Proto()->ParseFromString(node->pb_msg());
}
+
*block_desc.Proto()->mutable_vars() =
argument_->origin_program_desc->blocks(0).vars();
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
- CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
+ CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto());
auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto *op = main_block->add_ops();
PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
index a6f85484756417e103cbb60bcb664e8b800b9f28..c05b0e5d4690d0a447edf63a149903704bc2c9be 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
@@ -46,9 +46,9 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
for (size_t i = 0; i < graph->nodes.size(); i++) {
const Node &node = graph->nodes.Get(i);
if (!config_.display_deleted_node && node.deleted()) continue;
- for (auto &in : node.inlinks) {
- if (!config_.display_deleted_node && in->deleted()) continue;
- dot.AddEdge(in->repr(), node.repr(), {});
+ for (auto &out : node.outlinks) {
+ if (!config_.display_deleted_node && out->deleted()) continue;
+ dot.AddEdge(node.repr(), out->repr(), {});
}
}
return dot.Build();
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc
index 389f9e1a9148a4daf0e5b751cce5cb6325252a4e..80809d4c43ca08298bad25cf614dcb4117d3f99a 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -76,7 +76,7 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
std::vector> SubGraphSplitter::ExtractSubGraphs() {
std::vector marked_nodes;
- for (auto &node : GraphTraits(graph_).nodes()) {
+  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes_in_TS()) {
if (node.attr(kMarkerAttrName).Bool()) {
marked_nodes.push_back(&node);
}
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 259d79bedbf664f52b1189ca71567665a6d91180..08d0f493ab30d92a121d089d9003bc575429b4dd 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -74,9 +74,10 @@ if (WITH_ANAKIN) # only needed in CI
target_link_libraries(inference_anakin_api anakin anakin_saber_common)
target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
if (WITH_TESTING)
- cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
- ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
- DEPS inference_anakin_api_shared)
- target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
+ # this test is unstable, disable it first.
+ #cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
+ #ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
+ #DEPS inference_anakin_api_shared)
+ #target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endif(WITH_TESTING)
endif()
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index e74f23ff969f5a8f58a71da337c16dcbc14f10c0..63c3f0d7b3f5c2b9246e2b041796caf5eb562826 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
+#include <cstring>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
@@ -40,19 +41,36 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
+ if (!other.memory_owned_) {
+ data_ = other.data_;
+ length_ = other.length_;
+ memory_owned_ = other.memory_owned_;
+ } else {
+ Resize(other.length());
+ memcpy(data_, other.data(), other.length());
+ length_ = other.length();
+ memory_owned_ = true;
+ }
+ return *this;
+}
+
+PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {
// only the buffer with external memory can be copied
- assert(!other.memory_owned_);
data_ = other.data_;
length_ = other.length_;
memory_owned_ = other.memory_owned_;
+ other.data_ = nullptr;
+ other.length_ = 0;
+ other.memory_owned_ = false;
return *this;
}
void PaddleBuf::Resize(size_t length) {
// Only the owned memory can be reset, the external memory can't be changed.
if (length_ == length) return;
- assert(memory_owned_);
- Free();
+ if (memory_owned_) {
+ Free();
+ }
data_ = new char[length];
length_ = length;
memory_owned_ = true;
@@ -68,7 +86,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
void PaddleBuf::Free() {
if (memory_owned_ && data_) {
assert(length_ > 0);
- delete static_cast(data_);
+      delete[] static_cast<char*>(data_);
data_ = nullptr;
length_ = 0;
}
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 59b0df7968cce137843ba8cad38a62fdb8d3bfc1..b24414e8245b1a4d90acce4fa1ad5690e06b47dd 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -40,11 +40,12 @@ class PaddleBuf {
// Copy only available when memory is managed externally.
explicit PaddleBuf(const PaddleBuf&);
PaddleBuf& operator=(const PaddleBuf&);
+ PaddleBuf& operator=(PaddleBuf&&);
// Do not own the memory.
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
// Own memory.
- explicit PaddleBuf(size_t length)
+ PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes.
void Resize(size_t length);
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index d86c046f2e5b08a4c00cf6cad19627e6a196c798..8f42a37cd3f8978b917b42e8f45a128b8422aa57 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,6 +1,7 @@
# Add TRT tests
nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
+activation_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 1b6a0ad82f3ceb00cec15c28c8121adc22271b7a..41faaf7212accaaec238062b1340e8da8fa6be33 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -55,7 +55,6 @@ class OpConverter {
it = Registry::Lookup("fc");
}
}
-
if (op_desc.Type().find("elementwise") != std::string::npos) {
static std::unordered_set add_tensor_op_set{
"add", "mul", "sub", "div", "max", "min", "pow"};
@@ -72,6 +71,8 @@ class OpConverter {
"Unsupported elementwise type" + op_type);
it =
Registry::Lookup("elementwise_" + op_type + "_weight");
+ PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
+ op_desc.Type());
} else {
PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0,
"Unsupported elementwise type" + op_type);
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 0511eb42a073ac305634110a71a35e501f062132..f07ab5a33b87d7945e5fcdf8f3644f0711ce643b 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -280,12 +280,16 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel {
* ('any') which lets a primitive (convolution in this case) choose
* the memory format preferred for best performance
*/
+    std::string data_format = ctx.Attr<std::string>("data_format");
+ auto chosen_memory_format =
+ platform::data_format_to_memory_format(data_format);
+
auto src_md = platform::MKLDNNMemDesc(
- src_tz, platform::MKLDNNGetDataType(), memory::format::any);
+        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc(
- weights_tz, platform::MKLDNNGetDataType(), memory::format::any);
+        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto dst_md = platform::MKLDNNMemDesc(
- dst_tz, platform::MKLDNNGetDataType(), memory::format::any);
+        dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
// create a conv primitive descriptor and save it for usage in backward
std::shared_ptr conv_pd =
@@ -423,16 +427,20 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel {
* ('any') which lets a primitive (conv backward in this case) choose
* the memory format preferred for best performance
*/
+    std::string data_format = ctx.Attr<std::string>("data_format");
+ auto chosen_memory_format =
+ platform::data_format_to_memory_format(data_format);
+
auto src_md = platform::MKLDNNMemDesc(
- src_tz, platform::MKLDNNGetDataType(), memory::format::any);
+        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto diff_src_md = platform::MKLDNNMemDesc(
- src_tz, platform::MKLDNNGetDataType(), memory::format::any);
+        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc(
- weights_tz, platform::MKLDNNGetDataType(), memory::format::any);
+        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto diff_weights_md = platform::MKLDNNMemDesc(
- weights_tz, platform::MKLDNNGetDataType(), memory::format::any);
+        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto diff_dst_md = platform::MKLDNNMemDesc(
- dst_tz, platform::MKLDNNGetDataType(), memory::format::any);
+        dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
// Retrieve conv_pd from device context
auto conv_pd =
diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
index eb8272e90c32c3a0be2c0ce1bc679571af876317..bc3e95e904f8b6c2cdd2ae6685bf67580178e6b6 100644
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -534,8 +534,8 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
const framework::Tensor& dout, int axis,
framework::Tensor* dx, framework::Tensor* dy,
DX_OP dx_op, DY_OP dy_op) {
- const framework::DDim x_dim = x.dims();
- const framework::DDim y_dim = y.dims();
+ const framework::DDim& x_dim = x.dims();
+ const framework::DDim& y_dim = y.dims();
if (x.dims() == y.dims()) {
ElemwiseGradComputeNoBroadcast(
ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
@@ -558,19 +558,19 @@ void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx,
framework::Tensor* dx, framework::Tensor* dy,
DX_OP dx_op, DY_OP dy_op) {
if (dy == nullptr) {
- const framework::DDim dx_dims = dout.dims();
+ const framework::DDim& dx_dims = dout.dims();
auto dy_dims = dx_dims;
ElemwiseGradComputeNoBroadcast(
ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
} else {
if (dout.dims() == dy->dims()) {
- const framework::DDim dx_dims = dout.dims();
- const framework::DDim dy_dims = dy->dims();
+ const framework::DDim& dx_dims = dout.dims();
+ const framework::DDim& dy_dims = dy->dims();
ElemwiseGradComputeNoBroadcast(
ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
} else { // Y is a scalar
auto dx_dims = dout.dims();
- const framework::DDim dy_dims = dy->dims();
+ const framework::DDim& dy_dims = dy->dims();
ElemwiseGradComputeWithBroadcast(
ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
}
diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc
index bcb3e63ed7dbc775c1de6c4522f0548ea48a6cf0..dc7ef664958238ddbd48745bd59cc7db28e49f5b 100644
--- a/paddle/fluid/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
@@ -31,7 +31,6 @@ class FeedOp : public framework::OperatorBase {
const platform::Place &place) const override {
// get device context from pool
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
- platform::RecordEvent record_event(Type(), dev_ctx);
auto feed_var_name = Input("X");
auto *feed_var = scope.FindVar(feed_var_name);
diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc
index 680fde19eefe57475b7526ebc29d4ff977a16977..d9cd956dfdff3d009d38ee5088f5396080580483 100644
--- a/paddle/fluid/operators/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/fetch_barrier_op.cc
@@ -36,12 +36,6 @@ class FetchBarrierOp : public framework::OperatorBase {
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
std::vector eps = Attr>("endpoints");
-
- platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
- auto& ctx = *pool.Get(place);
- // For profiling
- platform::RecordEvent record_event(Type(), &ctx);
-
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance();
diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc
index 1640a2a22c69a0e3ab81a2889d6105b2cf4162b7..c197b45e8196a47def6465128e8ca39d8daefed6 100644
--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
@@ -30,9 +30,6 @@ class FetchOp : public framework::OperatorBase {
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
- platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
- platform::RecordEvent record_event(Type(), pool.Get(place));
-
auto fetch_var_name = Input("X");
auto *fetch_var = scope.FindVar(fetch_var_name);
PADDLE_ENFORCE(fetch_var != nullptr,
diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused_elemwise_activation_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a6fd0aeb021dce40339c32251af130d5984dccd2
--- /dev/null
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.cc
@@ -0,0 +1,221 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/fused_elemwise_activation_op.h"
+
+namespace paddle {
+namespace operators {
+
+class FusedElemwiseActivationOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ void InferShape(framework::InferShapeContext *ctx) const override {
+ PADDLE_ENFORCE(
+ ctx->HasInput("X"),
+ "Input(X) of FusedElemwiseActivationOp op should not be null.");
+ PADDLE_ENFORCE(
+ ctx->HasInput("Y"),
+ "Input(Y) of FusedElemwiseActivationOp op should not be null.");
+ PADDLE_ENFORCE(
+ ctx->HasOutput("Out"),
+ "Output(Out) of FusedElemwiseActivationOp op should not be null.");
+
+ auto x_dim = ctx->GetInputDim("X");
+ auto y_dim = ctx->GetInputDim("Y");
+ PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
+ "Rank of first input must >= rank of second input.");
+
+ ctx->SetOutputDim("Out", x_dim);
+ ctx->ShareLoD("X", /*->*/ "Out");
+ }
+
+ protected:
+ framework::OpKernelType GetExpectedKernelType(
+ const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx.Input<framework::Tensor>("X")->type(),
+                      ctx.Input<framework::Tensor>("Y")->type(),
+ "The element's type of input should be the same.");
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type());
+ return framework::OpKernelType(input_data_type, ctx.GetPlace());
+ }
+};
+
+class FusedElemwiseActivationMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ void Make() override {
+ AddInput("X", "(vector)");
+ AddInput("Y", "(vector)");
+ AddOutput("Out", "vector");
+ AddAttr("axis",
+ "axis is used by elementwise_op, the default value is -1.")
+ .SetDefault(-1);
+ AddAttr("scale",
+ "scale is used by scale_op, the default value is 0.0.")
+ .SetDefault(0.0);
+ AddAttr("recomputation",
+ "Whether to recompute the Out."
+ "fused_elemwise_activation_grad has two methods to get the "
+ "dx and dy, one "
+ "is to use the 'Out', and the other is not to use it. "
+ "The former method will save the time of recomputing the "
+ "'Out', but it must occupy the memory to store the 'out'. "
+ "While, the later method can avoid occupying the memory, "
+ "but it must recompute the 'Out'. The default value is true.")
+ .SetDefault(true);
+    AddAttr<std::vector<std::string>>("functor_list",
+                                      "The functors that should be fused.")
+        .AddCustomChecker([&](const std::vector<std::string> &functor_list) {
+ PADDLE_ENFORCE(ValidCheck(functor_list));
+ });
+
+ AddComment(R"DOC(
+FusedElemwiseActivation Operator.
+
+At present, FusedElemwiseActivation only supports two kinds of compound
+operators (elementwise_op and activation_op):
+
+ Z = Binary(X, Unary(Y))
+ Z = Unary(Binary(X, Y))
+
+The attributes of the activation_op can be obtained from the
+fused_elemwise_activation_op's attributes. functor_list records the functors
+to be fused, for example "scale,elementwise_add".
+
+)DOC");
+ }
+
+ private:
+  bool ValidCheck(const std::vector<std::string> &functors) {
+    std::unordered_set<std::string> unary_fun = {"scale", "relu"};
+    std::unordered_set<std::string> binary_fun = {"elementwise_add"};
+
+ std::string unary_fun_str;
+ if (binary_fun.count(functors[0])) {
+ unary_fun_str = functors[1];
+ } else if (binary_fun.count(functors[1])) {
+ unary_fun_str = functors[0];
+ } else {
+ PADDLE_THROW("%s and %s are not included in fused_list.", functors[0],
+ functors[1]);
+ }
+ PADDLE_ENFORCE_EQ(unary_fun.count(unary_fun_str), 1,
+ "%s is not included in fused_list.", unary_fun_str);
+ return true;
+ }
+};
+
+class FusedElemwiseActivationGradMaker
+ : public framework::SingleGradOpDescMaker {
+ public:
+ using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+ auto *op_desc_ptr = new framework::OpDesc();
+ op_desc_ptr->SetType(this->ForwardOpType() + "_grad");
+
+ for (auto &input_param : this->InputNames()) {
+ op_desc_ptr->SetInput(input_param, this->Input(input_param));
+ op_desc_ptr->SetOutput(framework::GradVarName(input_param),
+ this->InputGrad(input_param, true));
+ }
+
+ for (auto &output_param : this->OutputNames()) {
+ op_desc_ptr->SetInput(output_param, this->Output(output_param));
+ op_desc_ptr->SetInput(framework::GradVarName(output_param),
+ this->OutputGrad(output_param));
+ }
+ op_desc_ptr->SetAttrMap(this->Attrs());
+
+    std::vector<std::string> functor_names =
+        boost::get<std::vector<std::string>>(
+            op_desc_ptr->GetAttr("functor_list"));
+ functor_names[0] += "_grad";
+ functor_names[1] += "_grad";
+ op_desc_ptr->SetAttr("functor_list", functor_names);
+    return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
+ }
+};
+
+class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ void InferShape(framework::InferShapeContext *ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+ PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+ PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+ "Input(Out@GRAD) should not be null");
+
+ auto x_dims = ctx->GetInputDim("X");
+ auto y_dims = ctx->GetInputDim("Y");
+ auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+ PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+ "Rank of first input must >= rank of second input.");
+
+ auto x_grad_name = framework::GradVarName("X");
+ auto y_grad_name = framework::GradVarName("Y");
+ if (ctx->HasOutput(x_grad_name)) {
+ ctx->SetOutputDim(x_grad_name, x_dims);
+ }
+ if (ctx->HasOutput(y_grad_name)) {
+ ctx->SetOutputDim(y_grad_name, y_dims);
+ }
+ }
+
+ protected:
+ framework::OpKernelType GetExpectedKernelType(
+ const framework::ExecutionContext &ctx) const override {
+    auto input_data_type_index = ctx.Input<framework::Tensor>("X")->type();
+    PADDLE_ENFORCE_EQ(input_data_type_index,
+                      ctx.Input<framework::Tensor>("Y")->type(),
+                      "The element's type of input should be the same.");
+    PADDLE_ENFORCE_EQ(
+        input_data_type_index,
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
+ "The element's type of input should be the same.");
+
+ auto input_data_type = framework::ToDataType(input_data_type_index);
+ return framework::OpKernelType(input_data_type, ctx.GetPlace());
+ }
+};
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fused_elemwise_activation, ops::FusedElemwiseActivationOp,
+ ops::FusedElemwiseActivationMaker,
+ ops::FusedElemwiseActivationGradMaker);
+REGISTER_OPERATOR(fused_elemwise_activation_grad,
+ ops::FusedElemwiseActivationOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+ fused_elemwise_activation,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CPUDeviceContext,
+                                       float>,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CPUDeviceContext,
+                                       double>);
+
+REGISTER_OP_CPU_KERNEL(
+ fused_elemwise_activation_grad,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CPUDeviceContext,
+                                           float>,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CPUDeviceContext,
+                                           double>);
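
Note: the DOC block above describes the two compound forms the op can evaluate. The following is a minimal standalone sketch of that composition, outside the Paddle codebase; the Add and Scale functors here are illustrative stand-ins, not the math::AddFunctor/math::ScaleFunctor used by the operator.

#include <iostream>

struct Add {    // binary functor
  double operator()(double x, double y) const { return x + y; }
};
struct Scale {  // unary functor
  explicit Scale(double s) : s_(s) {}
  double operator()(double x) const { return s_ * x; }
  double s_;
};

// Z = Binary(X, Unary(Y)), e.g. functor_list = {"elementwise_add", "scale"}
template <typename Binary, typename Unary>
double BinaryCompound(const Binary &b, const Unary &u, double x, double y) {
  return b(x, u(y));
}

// Z = Unary(Binary(X, Y)), e.g. functor_list = {"scale", "elementwise_add"}
template <typename Unary, typename Binary>
double UnaryCompound(const Unary &u, const Binary &b, double x, double y) {
  return u(b(x, y));
}

int main() {
  const double x = 2.0, y = 3.0, scale = 0.5;
  std::cout << BinaryCompound(Add(), Scale(scale), x, y) << "\n";  // 2 + 0.5*3 = 3.5
  std::cout << UnaryCompound(Scale(scale), Add(), x, y) << "\n";   // 0.5*(2+3) = 2.5
  return 0;
}
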
diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.cu b/paddle/fluid/operators/fused_elemwise_activation_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e1d2b16b4b5e3a480777f834c2cbeb6d00a755e4
--- /dev/null
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.cu
@@ -0,0 +1,30 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused_elemwise_activation_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+ fused_elemwise_activation,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
+                                       float>,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
+                                       double>);
+
+REGISTER_OP_CUDA_KERNEL(
+ fused_elemwise_activation_grad,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
+                                           float>,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
+                                           double>);
diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused_elemwise_activation_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe0017b824532b1210d0ae3e51983d63d081f12a
--- /dev/null
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.h
@@ -0,0 +1,425 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/functors.h"
+
+namespace math = paddle::operators::math;
+
+namespace paddle {
+namespace operators {
+
+// CompoundFunctors
+// For example: Z = Binary(X, Unary(Y))
+template <typename T, typename BinaryFun, typename UnaryFun>
+struct BinaryCompoundFunctor {
+ BinaryCompoundFunctor(const BinaryFun &binary_fun, const UnaryFun &unary_fun)
+ : binary_fun_(binary_fun), unary_fun_(unary_fun) {}
+
+ inline HOSTDEVICE T operator()(T x, T y) {
+ return binary_fun_(x, unary_fun_(y));
+ }
+
+ private:
+ BinaryFun binary_fun_;
+ UnaryFun unary_fun_;
+};
+
+// For example: Z = Unary(Binary(X, Y))
+template <typename T, typename UnaryFun, typename BinaryFun>
+struct UnaryCompoundFunctor {
+ UnaryCompoundFunctor(const UnaryFun &unary_fun, const BinaryFun &binary_fun)
+ : unary_fun_(unary_fun), binary_fun_(binary_fun) {}
+
+ inline HOSTDEVICE T operator()(T x, T y) {
+ return unary_fun_(binary_fun_(x, y));
+ }
+
+ private:
+ UnaryFun unary_fun_;
+ BinaryFun binary_fun_;
+};
+
+// FIXME(zcd): DBinaryFun and DUnaryFun have two methods to get
+// the dx: one is to use the 'out', and the other is not to use it.
+// The former method saves the time of recomputing the
+// 'out', but it must occupy the memory to store the 'out'.
+// The latter method avoids occupying this memory,
+// but it must recompute the 'out'.
+
+template <typename T, typename DBinaryFun, typename UnaryFun,
+          bool Recomputation = true>
+struct BinaryCompoundGradDxFunctor {
+ BinaryCompoundGradDxFunctor(const DBinaryFun &d_binary_fun,
+ const UnaryFun &unary_fun)
+ : d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {}
+
+ inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+ if (Recomputation) {
+ return dout * d_binary_fun_(x, unary_fun_(y));
+ } else {
+ return dout * d_binary_fun_(x, unary_fun_(y), out);
+ }
+ }
+
+ private:
+ DBinaryFun d_binary_fun_;
+ UnaryFun unary_fun_;
+};
+
+template <typename T, typename DBinaryFun, typename UnaryFun,
+          typename DUnaryFun, bool Recomputation = true>
+struct BinaryCompoundGradDyFunctor {
+ BinaryCompoundGradDyFunctor(const DBinaryFun &d_binary_fun,
+ const UnaryFun &unary_fun,
+ const DUnaryFun &d_unary_fun)
+ : d_binary_fun_(d_binary_fun),
+ unary_fun_(unary_fun),
+ d_unary_fun_(d_unary_fun) {}
+
+ inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+ if (Recomputation) {
+ return dout * d_binary_fun_(unary_fun_(y), x) * d_unary_fun_(y);
+ } else {
+ return dout * d_binary_fun_(unary_fun_(y), x, out) * d_unary_fun_(y);
+ }
+ }
+
+ private:
+ DBinaryFun d_binary_fun_;
+ UnaryFun unary_fun_;
+ DUnaryFun d_unary_fun_;
+};
+
+template <typename T, typename DUnaryFun, typename BinaryFun,
+          typename DBinaryFun, bool Recomputation = true>
+struct UnaryCompoundGradDxFunctor {
+ UnaryCompoundGradDxFunctor(const DUnaryFun &d_unary_fun,
+ const BinaryFun &binary_fun,
+ const DBinaryFun &d_binary_fun)
+ : d_unary_fun_(d_unary_fun),
+ binary_fun_(binary_fun),
+ d_binary_fun_(d_binary_fun) {}
+
+ inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+ T base;
+ if (Recomputation) {
+ base = dout * d_unary_fun_(binary_fun_(x, y));
+ } else {
+ base = dout * d_unary_fun_(binary_fun_(x, y), out);
+ }
+ return base * d_binary_fun_(x, y);
+ }
+
+ private:
+ DUnaryFun d_unary_fun_;
+ BinaryFun binary_fun_;
+ DBinaryFun d_binary_fun_;
+};
+
+template <typename T, typename DUnaryFun, typename BinaryFun,
+          typename DBinaryFun, bool Recomputation = true>
+struct UnaryCompoundGradDyFunctor {
+ UnaryCompoundGradDyFunctor(const DUnaryFun &d_unary_fun,
+ const BinaryFun &binary_fun,
+ const DBinaryFun &d_binary_fun)
+ : d_unary_fun_(d_unary_fun),
+ binary_fun_(binary_fun),
+ d_binary_fun_(d_binary_fun) {}
+
+ inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+ T base;
+ if (Recomputation) {
+ base = dout * d_unary_fun_(binary_fun_(x, y));
+ } else {
+ base = dout * d_unary_fun_(binary_fun_(x, y), out);
+ }
+ return base * d_binary_fun_(y, x);
+ }
+
+ private:
+ DUnaryFun d_unary_fun_;
+ BinaryFun binary_fun_;
+ DBinaryFun d_binary_fun_;
+};
+
+template <typename DeviceContext, typename T, typename BinaryFunctor,
+          typename UnaryFunctor>
+static void RunBinaryCompoundFunctor(const framework::ExecutionContext &ctx,
+ const BinaryFunctor &binary_functor,
+ const UnaryFunctor &unary_functor,
+ const framework::Tensor *in_x,
+ const framework::Tensor *in_y,
+ framework::Tensor *output) {
+  int axis = ctx.Attr<int>("axis");
+  using BinaryCompoundFunctor =
+      BinaryCompoundFunctor<T, BinaryFunctor, UnaryFunctor>;
+
+  ElementwiseComputeEx<BinaryCompoundFunctor, DeviceContext, T>(
+ ctx, in_x, in_y, axis,
+ BinaryCompoundFunctor(binary_functor, unary_functor), output);
+}
+
+template <typename DeviceContext, typename T, typename UnaryFunctor,
+          typename BinaryFunctor>
+static void RunUnaryCompoundFunctors(const framework::ExecutionContext &ctx,
+ const UnaryFunctor &unary_functor,
+ const BinaryFunctor &binary_functor,
+ const framework::Tensor *in_x,
+ const framework::Tensor *in_y,
+ framework::Tensor *output) {
+  int axis = ctx.Attr<int>("axis");
+
+  using UnaryCompoundFunctor =
+      UnaryCompoundFunctor<T, UnaryFunctor, BinaryFunctor>;
+
+  ElementwiseComputeEx<UnaryCompoundFunctor, DeviceContext, T>(
+ ctx, in_x, in_y, axis,
+ UnaryCompoundFunctor(unary_functor, binary_functor), output);
+}
+
+template <typename DeviceContext, typename T, typename BinaryGradFunctor,
+          typename UnaryFunctor, typename UnaryGradFunctor,
+          bool Recomputation = true>
+static void RunBinaryCompoundGradFunctors(
+ const framework::ExecutionContext &ctx,
+ const BinaryGradFunctor &binary_grad_functor,
+ const UnaryFunctor &unary_functor,
+ const UnaryGradFunctor &unary_grad_functor, const framework::Tensor *in_x,
+ const framework::Tensor *in_y, const framework::Tensor *in_out,
+ const framework::Tensor *in_out_grad, framework::Tensor *x_grad,
+ framework::Tensor *y_grad) {
+  int axis = ctx.Attr<int>("axis");
+
+  using BinaryCompoundDxFunctor =
+      BinaryCompoundGradDxFunctor<T, BinaryGradFunctor, UnaryFunctor,
+                                  Recomputation>;
+  using BinaryCompoundDyFunctor =
+      BinaryCompoundGradDyFunctor<T, BinaryGradFunctor, UnaryFunctor,
+                                  UnaryGradFunctor, Recomputation>;
+
+  ElemwiseGradCompute<DeviceContext, T, BinaryCompoundDxFunctor,
+                      BinaryCompoundDyFunctor>(
+ ctx, *in_x, *in_y, *in_out, *in_out_grad, axis, x_grad, y_grad,
+ BinaryCompoundDxFunctor(binary_grad_functor, unary_functor),
+ BinaryCompoundDyFunctor(binary_grad_functor, unary_functor,
+ unary_grad_functor));
+}
+
+template <typename DeviceContext, typename T, typename UnaryGradFunctor,
+          typename BinaryFunctor, typename BinaryGradFunctor,
+          bool Recomputation = true>
+static void RunUnaryCompoundGradFunctors(
+ const framework::ExecutionContext &ctx,
+ const UnaryGradFunctor &unary_grad_functor,
+ const BinaryFunctor &binary_functor,
+ const BinaryGradFunctor &binary_grad_functor, const framework::Tensor *in_x,
+ const framework::Tensor *in_y, const framework::Tensor *in_out,
+ const framework::Tensor *in_out_grad, framework::Tensor *x_grad,
+ framework::Tensor *y_grad) {
+  int axis = ctx.Attr<int>("axis");
+
+  using UnaryCompoundDxFunctor =
+      UnaryCompoundGradDxFunctor<T, UnaryGradFunctor, BinaryFunctor,
+                                 BinaryGradFunctor, Recomputation>;
+  using UnaryCompoundDyFunctor =
+      UnaryCompoundGradDyFunctor<T, UnaryGradFunctor, BinaryFunctor,
+                                 BinaryGradFunctor, Recomputation>;
+
+  ElemwiseGradCompute<DeviceContext, T, UnaryCompoundDxFunctor,
+                      UnaryCompoundDyFunctor>(
+ ctx, *in_x, *in_y, *in_out, *in_out_grad, axis, x_grad, y_grad,
+ UnaryCompoundDxFunctor(unary_grad_functor, binary_functor,
+ binary_grad_functor),
+ UnaryCompoundDyFunctor(unary_grad_functor, binary_functor,
+ binary_grad_functor));
+}
+
+template <typename DeviceContext, typename T>
+static void RunFunctors(const framework::ExecutionContext &ctx,
+ const framework::Tensor *in_x,
+ const framework::Tensor *in_y,
+ framework::Tensor *output) {
+  auto &functors = ctx.Attr<std::vector<std::string>>("functor_list");
+ auto funcs_str = functors[0] + "," + functors[1];
+ // TODO(zcd): The following code can be refined.
+ if (funcs_str == "elementwise_add,scale") {
+ // Z = Binary(X, Unary(Y))
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    RunBinaryCompoundFunctor<DeviceContext, T, math::AddFunctor<T>,
+                             math::ScaleFunctor<T>>(
+        ctx, math::AddFunctor<T>(), math::ScaleFunctor<T>(scale), in_x, in_y,
+        output);
+  } else if (funcs_str == "scale,elementwise_add") {
+    // Z = Unary(Binary(X, Y))
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    RunUnaryCompoundFunctors<DeviceContext, T, math::ScaleFunctor<T>,
+                             math::AddFunctor<T>>(
+        ctx, math::ScaleFunctor<T>(scale), math::AddFunctor<T>(), in_x, in_y,
+        output);
+  } else if (funcs_str == "elementwise_add,relu") {
+    RunBinaryCompoundFunctor<DeviceContext, T, math::AddFunctor<T>,
+                             math::ReluFunctor<T>>(
+        ctx, math::AddFunctor<T>(), math::ReluFunctor<T>(), in_x, in_y, output);
+  } else if (funcs_str == "relu,elementwise_add") {
+    RunUnaryCompoundFunctors<DeviceContext, T, math::ReluFunctor<T>,
+                             math::AddFunctor<T>>(
+        ctx, math::ReluFunctor<T>(), math::AddFunctor<T>(), in_x, in_y, output);
+ } else {
+ PADDLE_THROW("%s has not been implemented.", funcs_str);
+ }
+}
+
+template <typename DeviceContext, typename T>
+static void RunGradFunctors(const framework::ExecutionContext &ctx,
+ const framework::Tensor *in_x,
+ const framework::Tensor *in_y,
+ const framework::Tensor *in_out,
+ const framework::Tensor *in_out_grad,
+ framework::Tensor *x_grad,
+ framework::Tensor *y_grad) {
+  auto &functors = ctx.Attr<std::vector<std::string>>("functor_list");
+ auto funcs_str = functors[0] + "," + functors[1];
+
+  bool recomputation = ctx.Attr<bool>("recomputation");
+
+  // TODO(zcd): The following code can be refined, for example, by using registration.
+ if (funcs_str == "elementwise_add_grad,scale_grad") {
+ // The backward of Z = Binary(X, Unary(Y))
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    if (recomputation) {
+      RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
+                                    math::ScaleFunctor<T>,
+                                    math::ScaleGradFunctor<T>, true>(
+          ctx, math::AddGradFunctor<T>(), math::ScaleFunctor<T>(scale),
+          math::ScaleGradFunctor<T>(scale), in_x, in_y, in_out, in_out_grad,
+          x_grad, y_grad);
+    } else {
+      RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
+                                    math::ScaleFunctor<T>,
+                                    math::ScaleGradFunctor<T>, false>(
+          ctx, math::AddGradFunctor<T>(), math::ScaleFunctor<T>(scale),
+          math::ScaleGradFunctor<T>(scale), in_x, in_y, in_out, in_out_grad,
+          x_grad, y_grad);
+    }
+  } else if (funcs_str == "scale_grad,elementwise_add_grad") {
+    // The backward of Z = Unary(Binary(X, Y))
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    if (recomputation) {
+      RunUnaryCompoundGradFunctors<DeviceContext, T, math::ScaleGradFunctor<T>,
+                                   math::AddFunctor<T>, math::AddGradFunctor<T>,
+                                   true>(ctx, math::ScaleGradFunctor<T>(scale),
+                                         math::AddFunctor<T>(),
+                                         math::AddGradFunctor<T>(), in_x, in_y,
+                                         in_out, in_out_grad, x_grad, y_grad);
+    } else {
+      RunUnaryCompoundGradFunctors<DeviceContext, T, math::ScaleGradFunctor<T>,
+                                   math::AddFunctor<T>, math::AddGradFunctor<T>,
+                                   false>(ctx, math::ScaleGradFunctor<T>(scale),
+                                          math::AddFunctor<T>(),
+                                          math::AddGradFunctor<T>(), in_x, in_y,
+                                          in_out, in_out_grad, x_grad, y_grad);
+    }
+  } else if (funcs_str == "elementwise_add_grad,relu_grad") {
+    if (recomputation) {
+      RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
+                                    math::ReluFunctor<T>,
+                                    math::ReluGradFunctor<T>, true>(
+          ctx, math::AddGradFunctor<T>(), math::ReluFunctor<T>(),
+          math::ReluGradFunctor<T>(), in_x, in_y, in_out, in_out_grad, x_grad,
+          y_grad);
+    } else {
+      RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
+                                    math::ReluFunctor<T>,
+                                    math::ReluGradFunctor<T>, false>(
+          ctx, math::AddGradFunctor<T>(), math::ReluFunctor<T>(),
+          math::ReluGradFunctor<T>(), in_x, in_y, in_out, in_out_grad, x_grad,
+          y_grad);
+    }
+  } else if (funcs_str == "relu_grad,elementwise_add_grad") {
+    if (recomputation) {
+      RunUnaryCompoundGradFunctors<DeviceContext, T, math::ReluGradFunctor<T>,
+                                   math::AddFunctor<T>, math::AddGradFunctor<T>,
+                                   true>(ctx, math::ReluGradFunctor<T>(),
+                                         math::AddFunctor<T>(),
+                                         math::AddGradFunctor<T>(), in_x, in_y,
+                                         in_out, in_out_grad, x_grad, y_grad);
+    } else {
+      RunUnaryCompoundGradFunctors<DeviceContext, T, math::ReluGradFunctor<T>,
+                                   math::AddFunctor<T>, math::AddGradFunctor<T>,
+                                   false>(ctx, math::ReluGradFunctor<T>(),
+                                          math::AddFunctor<T>(),
+                                          math::AddGradFunctor<T>(), in_x, in_y,
+                                          in_out, in_out_grad, x_grad, y_grad);
+    }
+ } else {
+ PADDLE_THROW("%s has not been implemented.", funcs_str);
+ }
+}
+
+template <typename DeviceContext, typename T>
+class FusedElemwiseActivationKernel : public framework::OpKernel<T> {
+ public:
+ void Compute(const framework::ExecutionContext &ctx) const override {
+    auto &in_x = detail::Ref(ctx.Input<framework::Tensor>("X"),
+                             "Cannot get input tensor %s, variable name = %s",
+                             "X", ctx.op().Input("X"));
+    auto &in_y = detail::Ref(ctx.Input<framework::Tensor>("Y"),
+                             "Cannot get input tensor %s, variable name = %s",
+                             "Y", ctx.op().Input("Y"));
+    auto &output = detail::Ref(ctx.Output<framework::Tensor>("Out"),
+                               "Cannot get output tensor %s, variable name = %s",
+                               "Out", ctx.op().Output("Out"));
+
+    RunFunctors<DeviceContext, T>(ctx, &in_x, &in_y, &output);
+ }
+};
+
+template <typename DeviceContext, typename T>
+class FusedElemwiseActivationGradKernel : public framework::OpKernel<T> {
+ public:
+ void Compute(const framework::ExecutionContext &ctx) const override {
+    auto &in_x = detail::Ref(ctx.Input<framework::Tensor>("X"),
+                             "Cannot get input tensor %s, variable name = %s",
+                             "X", ctx.op().Input("X"));
+    auto &in_y = detail::Ref(ctx.Input<framework::Tensor>("Y"),
+                             "Cannot get input tensor %s, variable name = %s",
+                             "Y", ctx.op().Input("Y"));
+    auto &in_out = detail::Ref(ctx.Input<framework::Tensor>("Out"),
+                               "Cannot get input tensor %s, variable name = %s",
+                               "Out", ctx.op().Input("Out"));
+    auto &in_out_grad =
+        detail::Ref(ctx.Input<framework::Tensor>(framework::GradVarName("Out")),
+                    "Cannot get input tensor %s, variable name = %s",
+                    framework::GradVarName("Out"),
+                    ctx.op().Input(framework::GradVarName("Out")));
+
+    framework::Tensor *x_grad =
+        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    framework::Tensor *y_grad =
+        ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
+
+    RunGradFunctors<DeviceContext, T>(ctx, &in_x, &in_y, &in_out, &in_out_grad,
+                                      x_grad, y_grad);
+ }
+};
+} // namespace operators
+} // namespace paddle
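
Note: the recomputation attribute and the FIXME above describe a memory/compute trade-off in the grad functors: either the forward 'out' is kept and reused, or it is recomputed from the inputs. Below is a small numeric sketch of that trade-off for Z = Scale(Add(X, Y)), using plain doubles rather than the real functors; the expressions are illustrative, not the math/functors.h implementations.

#include <cassert>

int main() {
  const double x = 2.0, y = 3.0, dout = 1.5, scale = 0.5;

  // Forward pass: out = Scale(Add(x, y)) = scale * (x + y).
  const double intermediate = x + y;
  const double out = scale * intermediate;

  // Recomputation == true: 'out' was not stored, so dX is computed from the
  // inputs alone (d(scale*t)/dt = scale, d(x+y)/dx = 1).
  const double dx_recompute = dout * scale * 1.0;

  // Recomputation == false: 'out' was stored by the forward pass and the
  // gradient expression may read it instead of re-evaluating Add(x, y);
  // here out / intermediate recovers the scale factor.
  const double dx_reuse_out = dout * (out / intermediate) * 1.0;

  assert(dx_recompute == dx_reuse_out);  // both equal 0.75 for these values
  return 0;
}
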
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index ac35cf0b89bfaa0c0f8e64445f18a3bbd478e70a..27e26cb1b5c1e831f05dac299489628b92eaa58c 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -31,9 +31,6 @@ class LoadOp : public framework::OperatorBase {
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
- auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
- platform::RecordEvent record_event(Type(), dev_ctx);
-
// FIXME(yuyang18): We save variable to local file now, but we should change
// it to save an output stream.
auto filename = Attr<std::string>("file_path");
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index 3e8f3ec5c5cd683343bcbdfc2388bd37c25e00f9..d77b095c5d783a2a9fab87eb8b458117a6a3d225 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -32,11 +32,16 @@ class LookupTableOp : public framework::OperatorWithKernel {
auto table_dims = ctx->GetInputDim("W");
auto ids_dims = ctx->GetInputDim("Ids");
+ int ids_rank = ids_dims.size();
- PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
- PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+ PADDLE_ENFORCE_EQ(table_dims.size(), 2);
+ PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
+ "The last dimension of the 'Ids' tensor must be 1.");
- ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]});
+ auto output_dims =
+ framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1));
+ output_dims.push_back(table_dims[1]);
+ ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
if (ctx->GetOutputsVarType("Out")[0] ==
framework::proto::VarType::LOD_TENSOR) {
@@ -61,8 +66,7 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Ids",
"An input with type int32 or int64 "
"contains the ids to be looked up in W. "
- "Ids must be a column vector with rank = 2. "
- "The 2nd dimension size must be 1.");
+ "The last dimension size must be 1.");
AddOutput("Out", "The lookup results, which have the same type as W.");
AddAttr<bool>("is_sparse",
"(boolean, default false) "
diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu
index 27483372b93a850d313445386c7973838c4a0710..74823dab09cac358f647c074ac2f2ee2fed17e55 100644
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -118,28 +118,31 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
auto *ids_data = ids->data<int64_t>();
- auto ids_dim = ids->dims();
+ int64_t ids_num = ids->numel();
auto stream = dev_ctx.stream();
// copy GPU memory to CPU pinned memory
framework::Vector<int64_t> new_rows;
- new_rows.resize(ids_dim[0]);
+ new_rows.resize(ids_num);
auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
// TODO(yuyang18): Strange code here.
memory::Copy(platform::CPUPlace(),
new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
- ids_data, ids_dim[0] * sizeof(int64_t), stream);
+ ids_data, ids_num * sizeof(int64_t), stream);
d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value();
- d_table_value->Resize({ids_dim[0], table->dims()[1]});
+ d_table_value->Resize({ids_num, table->dims()[1]});
d_table_value->mutable_data<T>(context.GetPlace());
auto *d_table_data = d_table_value->data<T>();
auto *d_output_data = d_output->data<T>();
- PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
+ auto d_output_dims = d_output->dims();
+ PADDLE_ENFORCE_EQ(
+ d_table_value->dims(),
+ framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data,
d_output->numel() * sizeof(T), stream);
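
Note: the gradient-side check changes in the same way; instead of requiring d_output->dims() to equal d_table_value->dims() exactly, d_output is flattened to 2-D over all but its last axis, which matches the {ids_num, D} value tensor for Ids of any rank. A standalone sketch, where flatten_to_2d is a local stand-in for framework::flatten_to_2d:

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

std::vector<int64_t> flatten_to_2d(const std::vector<int64_t> &dims,
                                   size_t axis) {
  // Rows: product of the leading 'axis' dimensions; cols: product of the rest.
  int64_t rows = std::accumulate(dims.begin(), dims.begin() + axis,
                                 int64_t{1}, std::multiplies<int64_t>());
  int64_t cols = std::accumulate(dims.begin() + axis, dims.end(),
                                 int64_t{1}, std::multiplies<int64_t>());
  return {rows, cols};
}

int main() {
  std::vector<int64_t> d_output_dims = {32, 100, 64};  // [batch, seq_len, D]
  int64_t ids_num = 32 * 100;                          // numel of a [32, 100, 1] Ids
  auto flat = flatten_to_2d(d_output_dims, d_output_dims.size() - 1);
  assert((flat == std::vector<int64_t>{ids_num, 64}));
  return 0;
}
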
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index c9f074ca0e8dafb374dc9368165df5af5053a6b8..f5c10ced8305b64c6386c5051804f8c9a8f71802 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -109,17 +109,17 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
auto *ids_data = ids->data<int64_t>();
- auto ids_dim = ids->dims();
+ int64_t ids_num = ids->numel();
framework::Vector<int64_t> new_rows;
- new_rows.reserve(ids_dim[0]);
- for (int64_t i = 0; i < ids_dim[0]; i++) {
+ new_rows.reserve(ids_num);
+ for (int64_t i = 0; i < ids_num; i++) {
new_rows.push_back(ids_data[i]);
}
d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value();
- d_table_value->Resize({ids_dim[0], table_dim[1]});
+ d_table_value->Resize({ids_num, table_dim[1]});
d_table_value->mutable_data<T>(context.GetPlace());
d_table->set_height(table_dim[0]);
@@ -127,7 +127,10 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *d_output_data = d_output->data<T>();
auto *d_table_data = d_table_value->data<T>();
- PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
+ auto d_output_dims = d_output->dims();
+ PADDLE_ENFORCE_EQ(
+ d_table_value->dims(),
+ framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
} else {
auto *ids = context.Input<LoDTensor>("Ids");
@@ -135,10 +138,9 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
auto *ids_data = ids->data<int64_t>();
- auto ids_dim = ids->dims();
int N = table_dim[0];
- int D = d_output->dims()[1];
+ int D = table_dim[1];
auto *d_output_data = d_output->data<T>();
auto *d_table_data = d_table->mutable_data