Commit 4bfbc591 authored by G guosheng

Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into enhance-ReshapeOp

@@ -36,6 +36,7 @@ include(simd)
################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
+option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF)
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
@@ -180,6 +181,11 @@ if(WITH_GPU)
  include(cuda)
endif(WITH_GPU)
+if(WITH_AMD_GPU)
+  find_package(HIP)
+  include(hip)
+endif(WITH_AMD_GPU)
if(WITH_MKLML)
  list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
......
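With the new `WITH_AMD_GPU` option in place, an AMD build is selected at configure time. A minimal, hypothetical configure invocation (the flag names come from the options above; the build directory is a placeholder):

```
# Hypothetical: configure a ROCm/HIP build instead of a CUDA build.
cmake .. -DWITH_AMD_GPU=ON -DWITH_GPU=OFF
```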
@@ -57,11 +57,7 @@ if(NOT WITH_GOLANG)
  add_definitions(-DPADDLE_WITHOUT_GOLANG)
endif(NOT WITH_GOLANG)
-if(NOT WITH_GPU)
-  add_definitions(-DHPPL_STUB_FUNC)
-  list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
-else()
+if(WITH_GPU)
  add_definitions(-DPADDLE_WITH_CUDA)
  FIND_PACKAGE(CUDA REQUIRED)
@@ -84,7 +80,14 @@ else()
  # Include cuda and cudnn
  include_directories(${CUDNN_INCLUDE_DIR})
  include_directories(${CUDA_TOOLKIT_INCLUDE})
-endif(NOT WITH_GPU)
+elseif(WITH_AMD_GPU)
+  add_definitions(-DPADDLE_WITH_HIP)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__")
+else()
+  add_definitions(-DHPPL_STUB_FUNC)
+  list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
+endif()
if (WITH_MKLML AND MKLML_IOMP_LIB)
  message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
......
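The definitions set above are what device-specific code paths key on. A minimal illustrative sketch (the guarded bodies are placeholders, not code from this change):

```
// C++ sketch, illustrative only: branching on the compile definitions above.
#if defined(PADDLE_WITH_CUDA)
// built for NVIDIA GPUs; CUDA and cuDNN headers are available
#elif defined(PADDLE_WITH_HIP)
// built for AMD GPUs via HIP/ROCm; __HIP_PLATFORM_HCC__ is also defined
#else
// CPU-only build; HPPL_STUB_FUNC stubs out the GPU entry points
#endif
```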
@@ -24,7 +24,7 @@ set(BOOST_PROJECT "extern_boost")
# So we use 1.41.0 here.
set(BOOST_VER "1.41.0")
set(BOOST_TAR "boost_1_41_0")
-set(BOOST_URL "http://paddlepaddledeps.s3-website-us-west-1.amazonaws.com/${BOOST_TAR}.tar.gz")
+set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz")
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
......
@@ -4,18 +4,33 @@ SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
-ExternalProject_Add(
-    extern_eigen3
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
-    GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10
-    PREFIX ${EIGEN_SOURCE_DIR}
-    UPDATE_COMMAND ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND ""
-    INSTALL_COMMAND ""
-    TEST_COMMAND ""
-)
+if(WITH_AMD_GPU)
+    ExternalProject_Add(
+        extern_eigen3
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git"
+        GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
+        PREFIX ${EIGEN_SOURCE_DIR}
+        UPDATE_COMMAND ""
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND ""
+        INSTALL_COMMAND ""
+        TEST_COMMAND ""
+    )
+else()
+    ExternalProject_Add(
+        extern_eigen3
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
+        GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10
+        PREFIX ${EIGEN_SOURCE_DIR}
+        UPDATE_COMMAND ""
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND ""
+        INSTALL_COMMAND ""
+        TEST_COMMAND ""
+    )
+endif()
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)
......
@@ -34,7 +34,7 @@ SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
-SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER})
+SET(MKLML_ROOT ${MKLML_INSTALL_DIR})
SET(MKLML_INC_DIR ${MKLML_ROOT}/include)
SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
@@ -46,7 +46,7 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
  "PROJECT(MKLML)\n"
  "cmake_minimum_required(VERSION 3.0)\n"
-  "install(DIRECTORY ${MKLML_VER}\n"
+  "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n"
  "        DESTINATION ${MKLML_DST_DIR})\n")
ExternalProject_Add(
......
@@ -317,6 +317,82 @@ function(nv_test TARGET_NAME)
  endif()
endfunction(nv_test)
+function(hip_library TARGET_NAME)
+  if (WITH_AMD_GPU)
+    set(options STATIC static SHARED shared)
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(_sources ${hip_library_SRCS})
+    HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
+    if(_source_files)
+      list(REMOVE_ITEM _sources ${_source_files})
+    endif()
+    if(hip_library_SRCS)
+      if (hip_library_SHARED OR hip_library_shared) # build *.so
+        add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources})
+        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
+      else()
+        add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
+        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+        target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
+        find_fluid_modules(${TARGET_NAME})
+      endif()
+      if (hip_library_DEPS)
+        add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
+        target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+      endif()
+      # cpplint code style
+      foreach(source_file ${hip_library_SRCS})
+        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        endif()
+      endforeach()
+      add_style_check_target(${TARGET_NAME} ${hip_library_SRCS} ${hip_library_HEADERS})
+    else(hip_library_SRCS)
+      if (hip_library_DEPS)
+        merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
+      else()
+        message(FATAL_ERROR "Please specify source file or library in hip_library.")
+      endif()
+    endif(hip_library_SRCS)
+  endif()
+endfunction(hip_library)
+function(hip_binary TARGET_NAME)
+  if (WITH_AMD_GPU)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS})
+    if(hip_binary_DEPS)
+      target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
+      add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
+    endif()
+  endif()
+endfunction(hip_binary)
+function(hip_test TARGET_NAME)
+  if (WITH_AMD_GPU AND WITH_TESTING)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(_sources ${hip_test_SRCS})
+    HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
+    if(_source_files)
+      list(REMOVE_ITEM _sources ${_source_files})
+    endif()
+    add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
+    set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
+    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_test(${TARGET_NAME} ${TARGET_NAME})
+  endif()
+endfunction(hip_test)
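These helpers mirror the existing nv_library/nv_binary/nv_test wrappers, so operator targets can switch between CUDA and HIP builds without changing their CMake calls. A hypothetical usage sketch (target and file names are placeholders):

```
# Hypothetical targets; *.hip.cu sources are compiled with hipcc.
hip_library(my_hip_lib SRCS my_kernel.hip.cu DEPS tensor)
hip_binary(my_hip_tool SRCS my_tool.hip.cu DEPS my_hip_lib)
hip_test(my_hip_lib_test SRCS my_hip_lib_test.hip.cu DEPS my_hip_lib)
```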
function(go_library TARGET_NAME)
  set(options STATIC static SHARED shared)
  set(oneValueArgs "")
......
if(NOT WITH_AMD_GPU)
return()
endif()
include_directories("/opt/rocm/include")
include_directories("/opt/rocm/hipblas/include")
include_directories("/opt/rocm/hiprand/include")
include_directories("/opt/rocm/rocrand/include")
include_directories("/opt/rocm/rccl/include")
include_directories("/opt/rocm/thrust")
list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14")
if(WITH_DSO)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
endif(WITH_DSO)
if(WITH_DOUBLE)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE")
endif(WITH_DOUBLE)
if(WITH_TESTING)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
endif(WITH_TESTING)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()
if("x${HCC_HOME}" STREQUAL "x")
set(HCC_HOME "/opt/rocm/hcc")
endif()
set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
@@ -69,6 +69,12 @@ if(NOT CBLAS_FOUND)
    SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
    DSTS ${dst_dir} ${dst_dir}
  )
+elseif (WITH_MKLML)
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml")
+  copy(mklml_lib
+    SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
+    DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
+  )
endif()
# paddle fluid module
......
# go_op Design
## Introduction
The **go_op** allows users of PaddlePaddle to run program blocks on a detached
thread. It works in conjunction with CSP operators (channel_send,
channel_receive, channel_open, channel_close, and select) to allow users to
concurrently process data and communicate easily between different threads.
## How to use it
```
channel = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
with fluid.Go():
# Send a tensor of value 99 to "channel" on a detached thread
tensor = fill_constant(shape=[1], dtype='int', value=99)
tensor.stop_gradient = True
fluid.channel_send(channel, tensor)
# Receive sent tensor from "channel" on the main thread
result = fill_constant(shape=[1], dtype='int', value=-1)
fluid.channel_recv(channel, result)
```
The go operator can be accessed by using the fluid.Go() control flow. This
will create a new sub block, where the user can add additional operators
to be run on the thread.
**Note:** Since back propagation is currently not supported in the go_op, users
should ensure that operators in the go block do not require gradient
calculations.
## How it Works
Similar to other control blocks, go_op will create a sub block and add it
as a child to the current block. Operators and variables defined in this
block will be added to the go sub_block.
In addition, the go operator will create a new child scope whose parent is
the global scope. Please refer to [block captures](#block-captures) for more
information.
When the Paddle executor runs go_op, go_op will take the sub_block and pass it to
the executor.run method (along with a newly created local scope) on a detached
thread, as sketched below.
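The sketch below is a minimal illustration only, assuming the `Executor::Run(program, scope, block_id, create_local_scope, create_vars)` signature used in the concurrency test later in this change; `GoOpRunImpl` and the flag values are placeholders, not the actual operator code.

```
// C++ sketch: run the sub-block on a detached thread with a fresh child scope.
#include <thread>

void GoOpRunImpl(paddle::framework::Executor *executor,
                 const paddle::framework::ProgramDesc &program,
                 paddle::framework::Scope *parent_scope, int sub_block_id) {
  // Child scope, so the detached block can outlive its parent block.
  paddle::framework::Scope *local_scope = &parent_scope->NewScope();
  std::thread([=]() {
    executor->Run(program, local_scope, sub_block_id,
                  false /*create_local_scope*/, false /*create_vars*/);
  }).detach();  // go_op returns immediately; the thread is never joined
}
```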
An example of the generated program description is shown below. Take note of
the **go_op** in particular. It is added as an operator in the current
block (in this example, block0). The **go_op** contains a `sub_block`
attribute, which points to the id of the block that will be executed in a
detached thread.
```
blocks {
idx: 0
parent_idx: -1
vars {
name: "return_value"
type {
type: LOD_TENSOR
lod_tensor {
tensor {
data_type: INT64
}
}
}
}
vars {
name: "status_recv"
type {
type: LOD_TENSOR
lod_tensor {
tensor {
data_type: BOOL
}
}
}
}
...
ops {
outputs {
parameter: "Out"
arguments: "channel"
}
type: "channel_create"
attrs {
name: "data_type"
type: INT
i: 7
}
attrs {
name: "capacity"
type: INT
i: 0
}
}
ops {
inputs {
parameter: "X"
arguments: "channel"
}
type: "go"
attrs {
name: "sub_block"
type: BLOCK
block_idx: 1
}
}
ops {
inputs {
parameter: "Channel"
arguments: "channel"
}
outputs {
parameter: "Out"
arguments: "return_value"
}
outputs {
parameter: "Status"
arguments: "status_recv"
}
type: "channel_recv"
}
...
}
blocks {
idx: 1
parent_idx: 0
vars {
name: "status"
type {
type: LOD_TENSOR
lod_tensor {
tensor {
data_type: BOOL
}
}
}
}
...
ops {
outputs {
parameter: "Out"
arguments: "fill_constant_1.tmp_0"
}
type: "fill_constant"
attrs {
name: "force_cpu"
type: BOOLEAN
b: false
}
attrs {
name: "value"
type: FLOAT
f: 99.0
}
attrs {
name: "shape"
type: INTS
ints: 1
}
attrs {
name: "dtype"
type: INT
i: 3
}
}
ops {
inputs {
parameter: "Channel"
arguments: "channel"
}
inputs {
parameter: "X"
arguments: "fill_constant_1.tmp_0"
}
outputs {
parameter: "Status"
arguments: "status"
}
type: "channel_send"
attrs {
name: "copy"
type: BOOLEAN
b: false
}
}
```
## Current Limitations
#### <a name="block-captures"></a>Scopes and block captures:
Paddle utilizes [scopes](./../concepts/scope.md) to store variables used in a
block. When a block is executed, a new local scope is created from the parent
scope (i.e. the scope derived from the parent block) and associated with the new
child block. After the block finishes executing, the local scope and
all associated variables in the scope are deleted.
This works well in a single-threaded scenario; however, with the introduction of
go_op, a child block may continue to execute even after the parent block has
exited. If the go_op tries to access variables located in the parent block's
scope, it may receive a segmentation fault because the parent scope may have
been deleted.
We need to implement block closures in order to prevent access to parent
scope variables from causing a segmentation fault. As a temporary workaround,
please ensure that all variables accessed in the go block are not destructed
before they are accessed. Currently, the go_op will explicitly enforce
this requirement and raise an exception if a variable could not be found in
the scope.
Please refer to [Closure issue](https://github.com/PaddlePaddle/Paddle/issues/8502)
for more details.
#### Green Threads
Golang utilizes `green threads`, which are a mechanism for the runtime library to
manage multiple threads (instead of natively by the OS). Green threads usually
allow for faster thread creation and switching, as there is less overhead
when spawning these threads. For the first version of CSP, we only support
OS threads.
#### Backward Propagation:
go_op currently does not support backward propagation. Please use go_op with
non-training operators.
Development
------------
PaddlePaddle adheres to the following three sections of code and documentation specifications.

PaddlePaddle uses Git for version control, and Docker for its build and test environment. The code base includes CUDA, C++, Python, Shell, and other programming languages, which must comply with the Google C++ Style Guide and PEP 8; style is checked by an automatic inspection tool, and code comments need to follow the Doxygen specification. Code that does not meet the style requirements will fail to compile. We provide the following guidelines for the use of Git, build tests, and code development.

.. toctree::
   :maxdepth: 1

   contribute_to_paddle_en.md
PaddlePaddle is well documented in English and Chinese. We recommend using the English version of the documents and problem descriptions. The design documents focus on problem descriptions and backgrounds, followed by solutions. As documents are generated by Sphinx, code comments should comply with the Sphinx documentation standard. We recommend using the paddlepaddle.org tool to compile, generate, and preview documents locally. Please refer to:

.. toctree::
   :maxdepth: 1

   write_docs_en.rst
PaddlePaddle V2 defines new operations by adding new Layers. You can implement various complex layers by combining basic APIs to satisfy most applications. If you want to customize a layer, please refer to the following, and you are welcome to propose a patch:

.. toctree::
   :maxdepth: 1

   new_layer_en.rst
@@ -6,32 +6,32 @@ PaddlePaddle provides the users the ability to flexibly set various command line
.. toctree::
   :maxdepth: 1

-   cmd_parameter/index_cn.rst
+   cmd_parameter/index_en.rst

PaddlePaddle supports distributed training tasks on fabric clusters, MPI clusters, and Kubernetes clusters. For detailed configuration and usage instructions, refer to:

.. toctree::
   :maxdepth: 1

-   cluster/index_cn.rst
+   cluster/index_en.rst

PaddlePaddle provides a C-API for inference. We provide the following guidelines for using the C-API:

.. toctree::
   :maxdepth: 1

-   capi/index_cn.rst
+   capi/index_en.rst

PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For details, please refer to:

.. toctree::
   :maxdepth: 1

-   rnn/index_cn.rst
+   rnn/index_en.rst

For how to use the built-in timing tool, nvprof, or nvvp to run performance analysis and tuning, please refer to:

.. toctree::
   :maxdepth: 1

-   optimization/gpu_profiling_cn.rst
+   optimization/gpu_profiling_en.rst
@@ -34,7 +34,7 @@ class Channel {
 public:
  virtual bool CanSend() = 0;
  virtual bool CanReceive() = 0;
-  virtual bool Send(T*) = 0;
+  virtual void Send(T*) = 0;
  virtual bool Receive(T*) = 0;
  virtual size_t Cap() = 0;
  virtual void Lock() = 0;
@@ -84,69 +84,81 @@ class ChannelHolder {
  }

  template <typename T>
-  bool Send(T* data) {
-    if (!IsInitialized()) return false;
-    PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T)));
+  void Send(T* data) {
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    PADDLE_ENFORCE_EQ(
+        holder_->Type(), std::type_index(typeid(T)),
+        "Channel type is not same as the type of the data being sent");
    // Static cast should be safe because we have ensured that types are same
    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    return channel != nullptr ? channel->Send(data) : false;
+    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
+    channel->Send(data);
  }

  template <typename T>
  bool Receive(T* data) {
-    if (!IsInitialized()) return false;
-    PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T)));
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    PADDLE_ENFORCE_EQ(
+        holder_->Type(), std::type_index(typeid(T)),
+        "Channel type is not same as the type of the data being sent");
    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    return channel != nullptr ? channel->Receive(data) : false;
+    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
+    return channel->Receive(data);
  }

  bool IsClosed() {
-    if (IsInitialized()) {
-      return holder_->IsClosed();
-    }
-    return false;
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    return holder_->IsClosed();
  }

  bool CanSend() {
-    if (IsInitialized()) {
-      return holder_->CanSend();
-    }
-    return false;
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    return holder_->CanSend();
  }

  bool CanReceive() {
-    if (IsInitialized()) {
-      return holder_->CanReceive();
-    }
-    return false;
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    return holder_->CanReceive();
  }

  void close() {
-    if (IsInitialized()) holder_->Close();
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->Close();
  }

  size_t Cap() {
-    if (IsInitialized()) return holder_->Cap();
-    return -1;
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    return holder_->Cap();
  }

  void Lock() {
-    if (IsInitialized()) holder_->Lock();
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->Lock();
  }

  void Unlock() {
-    if (IsInitialized()) holder_->Unlock();
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->Unlock();
  }

  template <typename T>
  void AddToSendQ(const void* referrer, T* data,
                  std::shared_ptr<std::condition_variable_any> cond,
                  std::function<bool(ChannelAction)> cb) {
-    if (IsInitialized()) {
-      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-      if (channel != nullptr) {
-        channel->AddToSendQ(referrer, data, cond, cb);
-      }
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+    if (channel != nullptr) {
+      channel->AddToSendQ(referrer, data, cond, cb);
    }
  }
@@ -154,26 +166,31 @@ class ChannelHolder {
  void AddToReceiveQ(const void* referrer, T* data,
                     std::shared_ptr<std::condition_variable_any> cond,
                     std::function<bool(ChannelAction)> cb) {
-    if (IsInitialized()) {
-      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-      if (channel != nullptr) {
-        channel->AddToReceiveQ(referrer, data, cond, cb);
-      }
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+    if (channel != nullptr) {
+      channel->AddToReceiveQ(referrer, data, cond, cb);
    }
  }

  void RemoveFromSendQ(const void* referrer) {
-    if (IsInitialized()) holder_->RemoveFromSendQ(referrer);
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->RemoveFromSendQ(referrer);
  }

  void RemoveFromReceiveQ(const void* referrer) {
-    if (IsInitialized()) holder_->RemoveFromReceiveQ(referrer);
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->RemoveFromReceiveQ(referrer);
  }

  inline bool IsInitialized() const { return holder_ != nullptr; }

  inline const std::type_index Type() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true);
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
    return holder_->Type();
  }
......
@@ -31,7 +31,7 @@ class ChannelImpl : public paddle::framework::Channel<T> {
 public:
  virtual bool CanSend();
  virtual bool CanReceive();
-  virtual bool Send(T *);
+  virtual void Send(T *);
  virtual bool Receive(T *);
  virtual size_t Cap() { return cap_; }
  virtual void Lock();
@@ -76,10 +76,9 @@ class ChannelImpl : public paddle::framework::Channel<T> {
    }
  };

-  bool send_return(bool value) {
+  void send_return() {
    send_ctr--;
    destructor_cond_.notify_all();
-    return value;
  }

  bool recv_return(bool value) {
@@ -118,15 +117,15 @@ bool ChannelImpl<T>::CanReceive() {
}

template <typename T>
-bool ChannelImpl<T>::Send(T *item) {
+void ChannelImpl<T>::Send(T *item) {
  send_ctr++;
  std::unique_lock<std::recursive_mutex> lock{mu_};

-  // If channel is closed, do nothing
+  // If channel is closed, throw exception
  if (closed_) {
    lock.unlock();
-    // TODO(abhinavarora) Should panic on closed channel
-    return send_return(false);
+    send_return();
+    PADDLE_THROW("Cannot send on closed channel");
  }

  // If there is a receiver, directly pass the value we want
@@ -143,7 +142,7 @@ bool ChannelImpl<T>::Send(T *item) {
    if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND);
    if (do_send)
      *(m->data) = std::move(*item);
-    else
+    else {
      // We cannot do the data transfer because
      // this QueueMessage was added by Select
      // and some other case was executed.
@@ -151,12 +150,17 @@ bool ChannelImpl<T>::Send(T *item) {
      // We do not care about notifying other
      // because they would have been notified
      // by the executed select case.
-      return send_return(Send(item));
+      lock.unlock();
+      Send(item);
+      send_return();
+      return;
+    }

    // Wake up the blocked process and unlock
    m->Notify();
    lock.unlock();
-    return send_return(true);
+    send_return();
+    return;
  }

  // Unbuffered channel will always bypass this
@@ -167,7 +171,8 @@ bool ChannelImpl<T>::Send(T *item) {
    buf_.push_back(std::move(*item));
    // Release lock and return true
    lock.unlock();
-    return send_return(true);
+    send_return();
+    return;
  }

  // Block on channel, because some receiver will complete
@@ -175,8 +180,12 @@ bool ChannelImpl<T>::Send(T *item) {
  auto m = std::make_shared<QueueMessage>(item);
  sendq.push_back(m);
  m->Wait(lock);
-  // TODO(abhinavarora) Should panic on closed channel
-  return send_return(!m->chan_closed);
+  if (m->chan_closed) {
+    lock.unlock();
+    send_return();
+    PADDLE_THROW("Cannot send on closed channel");
+  }
+  send_return();
}

template <typename T>
......
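For callers, the practical effect is that Send no longer reports failure through a return value; sending on a closed (or uninitialized) channel now surfaces as a `paddle::platform::EnforceNotMet` exception. A minimal sketch using the `MakeChannel`/`CloseChannel` helpers exercised by the tests below:

```
// C++ sketch of the new Send contract: void return, throws when closed.
auto ch = paddle::framework::MakeChannel<int>(10);  // buffered, capacity 10
int value = 1;
ch->Send(&value);                      // succeeds: the buffer has room
paddle::framework::CloseChannel(ch);
try {
  ch->Send(&value);                    // now throws instead of returning false
} catch (paddle::platform::EnforceNotMet &e) {
  // handle the closed-channel error
}
delete ch;
```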
@@ -16,7 +16,6 @@ limitations under the License. */
#include <chrono>
#include <thread>
#include "gtest/gtest.h"

using paddle::framework::Channel;
@@ -41,7 +40,7 @@ void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
  unsigned sum_send = 0;
  std::thread t([&]() {
    for (int i = 0; i < 5; i++) {
-      EXPECT_EQ(ch->Send(&i), true);
+      ch->Send(&i);
      sum_send += i;
    }
  });
@@ -61,7 +60,7 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) {
  const size_t buffer_size = 10;
  auto ch = MakeChannel<size_t>(buffer_size);
  for (size_t i = 0; i < buffer_size; ++i) {
-    EXPECT_EQ(ch->Send(&i), true);  // should not block
+    ch->Send(&i);
  }

  size_t out;
@@ -82,7 +81,7 @@ void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
  const size_t data = 5;
  std::thread send_thread{[&]() {
    size_t i = data;
-    EXPECT_EQ(ch->Send(&i), true);  // should not block
+    ch->Send(&i);  // should not block
  }};

  std::thread recv_thread{[&]() {
@@ -94,12 +93,18 @@ void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
  send_thread.join();
  recv_thread.join();

-  // After closing send should return false. Receive should
-  // also return false as there is no data in queue.
+  // After closing, send should panic. Receive should
+  // also return false as there is no data in queue.
  CloseChannel(ch);
  send_thread = std::thread{[&]() {
    size_t i = data;
-    EXPECT_EQ(ch->Send(&i), false);  // should return false
+    bool is_exception = false;
+    try {
+      ch->Send(&i);
+    } catch (paddle::platform::EnforceNotMet e) {
+      is_exception = true;
+    }
+    EXPECT_EQ(is_exception, true);
  }};
  recv_thread = std::thread{[&]() {
    size_t i;
@@ -129,7 +134,7 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
  auto ch = MakeChannel<size_t>(buffer_size);

  for (size_t i = 0; i < buffer_size; ++i) {
-    EXPECT_EQ(ch->Send(&i), true);  // sending should not block
+    ch->Send(&i);  // sending should not block
  }

  size_t out;
@@ -160,9 +165,16 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
    // Try to write more than buffer size.
    for (size_t i = 0; i < 2 * buffer_size; ++i) {
      if (i < buffer_size)
-        EXPECT_EQ(ch->Send(&i), true);  // should block after 10 iterations
-      else
-        EXPECT_EQ(ch->Send(&i), false);
+        ch->Send(&i);  // should block after 10 iterations
+      else {
+        bool is_exception = false;
+        try {
+          ch->Send(&i);
+        } catch (paddle::platform::EnforceNotMet e) {
+          is_exception = true;
+        }
+        EXPECT_EQ(is_exception, true);
+      }
    }
  });
  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
@@ -231,7 +243,13 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
    t[i] = std::thread(
        [&](bool *ended, bool *success) {
          int data = 10;
-          *success = ch->Send(&data);
+          bool is_exception = false;
+          try {
+            ch->Send(&data);
+          } catch (paddle::platform::EnforceNotMet e) {
+            is_exception = true;
+          }
+          *success = !is_exception;
          *ended = true;
        },
        &thread_ended[i], &send_success[i]);
@@ -316,8 +334,11 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
    // Try to send more number of times
    // than receivers
    for (int i = 0; i < 4; i++) {
-      ch->Send(&i);
-      sum_send += i;
+      try {
+        ch->Send(&i);
+        sum_send += i;
+      } catch (paddle::platform::EnforceNotMet e) {
+      }
    }
  });
  for (int i = 0; i < 3; i++) {
@@ -382,7 +403,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
    t[i] = std::thread(
        [&](bool *ended, bool *success) {
          int data = 10;
-          *success = ch->Send(&data);
+          bool is_exception = false;
+          try {
+            ch->Send(&data);
+          } catch (paddle::platform::EnforceNotMet e) {
+            is_exception = true;
+          }
+          *success = !is_exception;
          *ended = true;
        },
        &thread_ended[i], &send_success[i]);
@@ -508,7 +535,7 @@ void ChannelHolderSendReceive(ChannelHolder *ch) {
  unsigned sum_send = 0;
  std::thread t([&]() {
    for (int i = 0; i < 5; i++) {
-      EXPECT_EQ(ch->Send(&i), true);
+      ch->Send(&i);
      sum_send += i;
    }
  });
@@ -541,8 +568,22 @@ TEST(ChannelHolder, ChannelUninitializedTest) {
  ChannelHolder *ch = new ChannelHolder();
  EXPECT_EQ(ch->IsInitialized(), false);
  int i = 10;
-  EXPECT_EQ(ch->Send(&i), false);
-  EXPECT_EQ(ch->Receive(&i), false);
+  bool send_exception = false;
+  try {
+    ch->Send(&i);
+  } catch (paddle::platform::EnforceNotMet e) {
+    send_exception = true;
+  }
+  EXPECT_EQ(send_exception, true);
+
+  bool recv_exception = false;
+  try {
+    ch->Receive(&i);
+  } catch (paddle::platform::EnforceNotMet e) {
+    recv_exception = true;
+  }
+  EXPECT_EQ(recv_exception, true);

  bool is_exception = false;
  try {
    ch->Type();
@@ -669,7 +710,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
    t[i] = std::thread(
        [&](bool *ended, bool *success) {
          int data = 10;
-          *success = ch->Send(&data);
+          bool is_exception = false;
+          try {
+            ch->Send(&data);
+          } catch (paddle::platform::EnforceNotMet e) {
+            is_exception = true;
+          }
+          *success = !is_exception;
          *ended = true;
        },
        &thread_ended[i], &send_success[i]);
@@ -760,7 +807,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
    t[i] = std::thread(
        [&](bool *ended, bool *success) {
          int data = 10;
-          *success = ch->Send(&data);
+          bool is_exception = false;
+          try {
+            ch->Send(&data);
+          } catch (paddle::platform::EnforceNotMet e) {
+            is_exception = true;
+          }
+          *success = !is_exception;
          *ended = true;
        },
        &thread_ended[i], &send_success[i]);
......
@@ -150,8 +150,9 @@ void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
  // Select block
  AddOp("select", {{"X", {dataChanName, quitChanName}},
                   {"case_to_execute", {"caseToExecute"}}},
-        {}, {{"sub_block", casesBlock},
-             {"cases", std::vector<std::string>{case0Config, case1Config}}},
+        {{"Out", {}}},
+        {{"sub_block", casesBlock},
+         {"cases", std::vector<std::string>{case0Config, case1Config}}},
        whileBlock);

  scope->Var("stepScopes");
@@ -209,9 +210,8 @@ TEST(Concurrency, Go_Op) {
  executor.Run(program, &scope, 0, true, true);

-  // After we call executor.run, the Go operator should do a channel_send to set
-  // the
-  // "result" variable to 99
+  // After we call executor.run, the Go operator should do a channel_send to
+  // set the "result" variable to 99.
  auto *finalData = tensor.data<int>();
  EXPECT_EQ(finalData[0], 99);
}
......
@@ -45,10 +45,11 @@ class Tensor {
  friend struct EigenVector;

 public:
-  Tensor() : offset_(0) {}
+  Tensor() : offset_(0), is_pinned_(false) {}

  /*! Constructor with place should only be used in pybind. */
-  explicit Tensor(const platform::Place& place) : offset_(0) {
+  explicit Tensor(const platform::Place& place)
+      : offset_(0), is_pinned_(false) {
    holder_->set_place(place);
  }
@@ -69,11 +70,12 @@ class Tensor {
   * @note If not exist, then allocation.
   */
  template <typename T>
-  inline T* mutable_data(platform::Place place);
+  inline T* mutable_data(platform::Place place, bool is_pinned = false);

-  inline void* mutable_data(platform::Place place, std::type_index type);
+  inline void* mutable_data(platform::Place place, std::type_index type,
+                            bool is_pinned = false);

-  inline void* mutable_data(platform::Place place);
+  inline void* mutable_data(platform::Place place, bool is_pinned = false);

  /**
   * @brief Return a pointer to mutable memory block.
@@ -84,7 +86,8 @@ class Tensor {
   * @note If not exist, then allocation.
   */
  template <typename T>
-  inline T* mutable_data(DDim dims, platform::Place place);
+  inline T* mutable_data(DDim dims, platform::Place place,
+                         bool is_pinned = false);

  /*! Return the dimensions of the memory block. */
  inline const DDim& dims() const;
@@ -92,6 +95,9 @@ class Tensor {
  /*! Return the numel of the memory block. */
  inline int64_t numel() const;

+  /*! Return whether the memory block is pinned. */
+  inline bool isPinned() const;

  /*! Resize the dimensions of the memory block. */
  inline Tensor& Resize(const DDim& dims);
@@ -146,12 +152,14 @@ class Tensor {
  template <typename Place>
  struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(Place place, size_t size, std::type_index type)
-        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
-               memory::PODDeleter<uint8_t, Place>(place)),
+    PlaceholderImpl(Place place, size_t size, std::type_index type,
+                    bool is_pinned = false)
+        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size, is_pinned)),
+               memory::PODDeleter<uint8_t, Place>(place, is_pinned)),
          place_(place),
          size_(size),
-          type_(type) {
+          type_(type),
+          is_pinned_(is_pinned) {
      PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
                              (is_cpu_place(place_) ? "CPU" : "GPU"));
    }
@@ -174,6 +182,9 @@ class Tensor {
    /* the current type of memory */
    std::type_index type_;

+    /*! use pinned memory or not. */
+    bool is_pinned_;
  };

  /*! holds the memory block if allocated. */
@@ -208,6 +219,7 @@ class Tensor {
   * PlaceHolder::ptr_ and where the tensor data really begins.
   */
  size_t offset_;
+  bool is_pinned_;
};

inline void Tensor::switch_place(platform::Place new_place) {
......
@@ -101,52 +101,55 @@ inline T* Tensor::data() {
}

template <typename T>
-inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
+inline T* Tensor::mutable_data(DDim dims, platform::Place place,
+                               bool is_pinned) {
  static_assert(std::is_pod<T>::value, "T must be POD");
  Resize(dims);
-  return mutable_data<T>(place);
+  return mutable_data<T>(place, is_pinned);
}

template <typename T>
-inline T* Tensor::mutable_data(platform::Place place) {
+inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) {
  static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
+  return reinterpret_cast<T*>(mutable_data(place, typeid(T), is_pinned));
}

-inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
+                                  bool is_pinned) {
  if (holder_ != nullptr) {
    holder_->set_type(type);
  }
-  PADDLE_ENFORCE_GT(
-      numel(), 0,
-      "When calling this method, the Tensor's numel must be larger than zero. "
-      "Please check Tensor::Resize has been called first.");
+  PADDLE_ENFORCE_GE(numel(), 0,
+                    "When calling this method, the Tensor's numel must be "
+                    "equal or larger than zero. "
+                    "Please check Tensor::Resize has been called first.");
  int64_t size = numel() * SizeOfType(type);
  /* some versions of boost::variant don't have operator!= */
  if (holder_ == nullptr || !(holder_->place() == place) ||
      holder_->size() < size + offset_) {
    if (platform::is_cpu_place(place)) {
      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size, type));
+          boost::get<platform::CPUPlace>(place), size, type, is_pinned));
    } else if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
      PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
    }
#else
      holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-          boost::get<platform::CUDAPlace>(place), size, type));
+          boost::get<platform::CUDAPlace>(place), size, type, is_pinned));
    }
#endif
    offset_ = 0;
+    is_pinned_ = is_pinned;
  }
  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                 offset_);
}

-inline void* Tensor::mutable_data(platform::Place place) {
+inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) {
  PADDLE_ENFORCE(this->holder_ != nullptr,
                 "Cannot invoke mutable data if current hold nothing");
-  return mutable_data(place, holder_->type());
+  return mutable_data(place, holder_->type(), is_pinned);
}

inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
@@ -188,6 +191,8 @@ inline const DDim& Tensor::dims() const { return dims_; }

inline int64_t Tensor::numel() const { return product(dims_); }

+inline bool Tensor::isPinned() const { return is_pinned_; }

inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
  Tensor res;
  res.ShareDataWith(src);
......
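A minimal usage sketch of the new `is_pinned` parameter (illustrative; the shape and place are placeholders):

```
// C++ sketch: back a tensor with pinned (page-locked) memory so later
// host<->device copies can use fast DMA transfers.
paddle::framework::Tensor t;
float *p = t.mutable_data<float>(paddle::framework::make_ddim({2, 3}),
                                 paddle::platform::CUDAPlace(0),
                                 true /*is_pinned*/);
// t.isPinned() now reports true for this allocation.
```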
@@ -119,6 +119,50 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) {
bool GPUAllocator::UseGpu() const { return true; }

+// PINNED memory allows direct DMA transfers by the GPU to and from system
+// memory. It's locked to a physical address.
+void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
+  if (size <= 0) return nullptr;
+  void* p;
+  // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
+  // of host pinned allocation. Allocating too much would reduce
+  // the amount of memory available to the underlying system for paging.
+  size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
+  if (size > usable) return nullptr;
+
+  // PINNED memory is visible to all CUDA contexts.
+  cudaError_t result = cudaMallocHost(&p, size);
+  if (result == cudaSuccess) {
+    index = 1;
+    fallback_alloc_size_ += size;
+    return p;
+  }
+  return nullptr;
+}
+
+void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
+  cudaError_t err;
+  PADDLE_ASSERT(index == 1);
+  PADDLE_ASSERT(fallback_alloc_size_ >= size);
+  fallback_alloc_size_ -= size;
+  err = cudaFreeHost(p);
+
+  // Purposefully allow cudaErrorCudartUnloading, because
+  // that is returned if you ever call cudaFreeHost after the
+  // driver has already shutdown. This happens only if the
+  // process is terminating, in which case we don't care if
+  // cudaFreeHost succeeds.
+  if (err != cudaErrorCudartUnloading) {
+    PADDLE_ENFORCE(err, "cudaFreeHost failed in GPUPinnedAllocator::Free.");
+  }
+}
+
+bool CUDAPinnedAllocator::UseGpu() const { return true; }
#endif
}  // namespace detail
......
@@ -51,6 +51,18 @@ class GPUAllocator : public SystemAllocator {
  size_t gpu_alloc_size_ = 0;
  size_t fallback_alloc_size_ = 0;
};

+class CUDAPinnedAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+
+ private:
+  size_t gpu_alloc_size_ =
+      0;  // TODO(zcd): how to define the upper limit of CUDAPinnedMemory?
+  size_t fallback_alloc_size_ = 0;
+};
#endif
}  // namespace detail
......
@@ -38,7 +38,8 @@ BuddyAllocator* GetCPUBuddyAllocator() {
}

template <>
-void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
+void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
+                                bool is_pinned) {
  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
  void* p = GetCPUBuddyAllocator()->Alloc(size);
  VLOG(10) << " pointer=" << p;
@@ -46,7 +47,8 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
}

template <>
-void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
+void Free<platform::CPUPlace>(platform::CPUPlace place, void* p,
+                              bool is_pinned) {
  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
  GetCPUBuddyAllocator()->Free(p);
}
@@ -82,15 +84,47 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
  return as[gpu_id];
}

+BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
+  static BuddyAllocator** as = NULL;
+  if (as == NULL) {
+    int gpu_num = platform::GetCUDADeviceCount();
+    as = new BuddyAllocator*[gpu_num];
+    for (int gpu = 0; gpu < gpu_num; gpu++) {
+      as[gpu] = nullptr;
+    }
+  }
+  platform::SetDeviceId(gpu_id);
+  if (!as[gpu_id]) {
+    as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator,
+                                    platform::GpuMinChunkSize(),
+                                    platform::GpuMaxChunkSize());
+    VLOG(10) << "\n\nNOTE: each GPU device use "
+             << FLAGS_fraction_of_gpu_memory_to_use * 100
+             << "% of GPU memory.\n"
+             << "You can set GFlags environment variable '"
+             << "FLAGS_fraction_of_gpu_memory_to_use"
+             << "' to change the fraction of GPU usage.\n\n";
+  }
+  return as[gpu_id];
+}

template <>
size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
  return GetGPUBuddyAllocator(place.device)->Used();
}

template <>
-void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
-  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
-  auto* ptr = buddy_allocator->Alloc(size);
+void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
+                                 bool is_pinned) {
+  void* ptr;
+  if (is_pinned) {
+    auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
+    ptr = buddy_allocator->Alloc(size);
+  } else {
+    auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+    ptr = buddy_allocator->Alloc(size);
+  }
+
  if (ptr == nullptr) {
    int cur_dev = platform::GetCurrentDeviceId();
    platform::SetDeviceId(place.device);
@@ -108,8 +142,13 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
}

template <>
-void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
-  GetGPUBuddyAllocator(place.device)->Free(p);
+void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p,
+                               bool is_pinned) {
+  if (is_pinned) {
+    GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
+  } else {
+    GetGPUBuddyAllocator(place.device)->Free(p);
+  }
}
#endif
......
@@ -33,7 +33,7 @@ namespace memory {
 * address is valid or not.
 */
template <typename Place>
-void* Alloc(Place place, size_t size);
+void* Alloc(Place place, size_t size, bool is_pinned = false);

/**
 * \brief Free memory block in one place.
@@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size);
 *
 */
template <typename Place>
-void Free(Place place, void* ptr);
+void Free(Place place, void* ptr, bool is_pinned = false);

/**
 * \brief Total size of used memory in one place.
@@ -74,11 +74,13 @@ class PODDeleter {
  static_assert(std::is_pod<T>::value, "T must be POD");

 public:
-  explicit PODDeleter(Place place) : place_(place) {}
-  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
+  explicit PODDeleter(Place place, bool is_pinned = false)
+      : place_(place), is_pinned_(is_pinned) {}
+  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr), is_pinned_); }

 private:
  Place place_;
+  bool is_pinned_;
};

/**
......
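A sketch of how the extended Alloc/Free/PODDeleter fit together (illustrative; the buffer size and place are placeholders):

```
// C++ sketch: pair a pinned allocation with a deleter carrying the same
// is_pinned flag, so Free() releases it through the pinned allocator.
#include <memory>

void PinnedBufferExample() {
  paddle::platform::CUDAPlace place(0);
  using Deleter = paddle::memory::PODDeleter<float, paddle::platform::CUDAPlace>;
  std::unique_ptr<float, Deleter> buf(
      static_cast<float *>(
          paddle::memory::Alloc(place, 1024 * sizeof(float), true /*is_pinned*/)),
      Deleter(place, true /*is_pinned*/));
}  // buf's deleter runs here: Free(place, ptr, /*is_pinned=*/true)
```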
@@ -59,7 +59,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
  EXPECT_EQ(total_size, 0UL);

  for (auto size :
-       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
    ps[paddle::memory::Alloc(cpu, size)] = size;

    // Buddy Allocator doesn't manage too large memory chunk
@@ -117,7 +117,7 @@ TEST(BuddyAllocator, GPUMultAlloc) {
  EXPECT_EQ(total_size, 0UL);

  for (auto size :
-       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
    ps[paddle::memory::Alloc(gpu, size)] = size;

    // Buddy Allocator doesn't manage too large memory chunk
......
@@ -12,6 +12,8 @@ function(op_library TARGET)
  set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
  set(cc_srcs)
  set(cu_srcs)
+  set(hip_cu_srcs)
+  set(miopen_hip_cc_srcs)
  set(cu_cc_srcs)
  set(cudnn_cu_cc_srcs)
  set(CUDNN_FILE)
@@ -36,10 +38,19 @@ function(op_library TARGET)
    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
      list(APPEND cu_srcs ${TARGET}.cu)
    endif()
+    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
+      list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
+    endif()
    string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
      list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
    endif()
+    if(WITH_AMD_GPU)
+      string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
+      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
+        list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
+      endif()
+    endif()
    if(WITH_MKLDNN)
      string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
@@ -48,10 +59,14 @@ function(op_library TARGET)
    endif()
  else()
    foreach(src ${op_library_SRCS})
-      if (${src} MATCHES ".*\\.cu$")
+      if (${src} MATCHES ".*\\.hip.cu$")
+        list(APPEND hip_cu_srcs ${src})
+      elseif (${src} MATCHES ".*\\.cu$")
        list(APPEND cu_srcs ${src})
      elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
        list(APPEND cudnn_cu_cc_srcs ${src})
+      elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
+        list(APPEND miopen_hip_cc_srcs ${src})
      elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
        list(APPEND mkldnn_cc_srcs ${src})
      elseif(${src} MATCHES ".*\\.cu.cc$")
@@ -76,6 +91,9 @@ function(op_library TARGET)
  if (WITH_GPU)
    nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
      ${op_common_deps})
+  elseif (WITH_AMD_GPU)
+    hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
+      ${op_common_deps})
  else()
    cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
      ${op_common_deps})
...@@ -88,7 +106,7 @@ function(op_library TARGET) ...@@ -88,7 +106,7 @@ function(op_library TARGET)
endif() endif()
endforeach() endforeach()
# The registration of USE_OP, please refer to paddle/framework/op_registry.h. # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to just add one operator to pybind in a *_op.cc file. # Note that it's enough to just add one operator to pybind in a *_op.cc file.
# And for detailed pybind information, please see the generated paddle/pybind/pybind.h. # And for detailed pybind information, please see the generated paddle/pybind/pybind.h.
file(READ ${TARGET}.cc TARGET_CONTENT) file(READ ${TARGET}.cc TARGET_CONTENT)
...@@ -114,7 +132,10 @@ function(op_library TARGET) ...@@ -114,7 +132,10 @@ function(op_library TARGET)
list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
...@@ -125,9 +146,19 @@ function(op_library TARGET) ...@@ -125,9 +146,19 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif() endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN # pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
endif() endif()
# pybind USE_OP # pybind USE_OP
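Taken together, the file(APPEND ...) branches above emit one registration line per operator into the generated pybind header. Roughly, the output looks like the following (the operator names here are illustrative, not taken from this commit):

// Illustrative contents of the generated pybind file after these branches run.
USE_CPU_ONLY_OP(my_cpu_op);              // no CUDA, HIP, or MKLDNN sources found
USE_OP_DEVICE_KERNEL(my_op, CUDNN);      // a *_cudnn_op.cu.cc source was present
USE_OP_DEVICE_KERNEL(my_op, MIOPEN);     // a *_miopen_op.hip.cc source was present
USE_OP_DEVICE_KERNEL(relu, MKLDNN);      // the special-cased first MKLDNN activation op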
...@@ -156,9 +187,13 @@ if(WITH_DISTRIBUTE) ...@@ -156,9 +187,13 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor) cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
else() else()
set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op) set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
endif() endif()
op_library(cond_op DEPS framework_proto tensor net_op) op_library(cond_op DEPS framework_proto tensor net_op)
...@@ -229,3 +264,4 @@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memor ...@@ -229,3 +264,4 @@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memor
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "mkldnn_activation_op.h"
#include "paddle/fluid/operators/activation_op.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
namespace {
template <typename T, typename ExecContext>
void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
const T alpha = 0, const T beta = 0) {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine();
// get buffers
const auto *src = ctx.template Input<Tensor>("X");
const auto *src_data = src->template data<T>();
auto *dst = ctx.template Output<Tensor>("Out");
const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
// get memory dim
PADDLE_ENFORCE(src->dims().size() == 4,
"Input dim must be with 4, i.e. NCHW");
std::vector<int> src_tz = framework::vectorize2int(src->dims());
// create memory description
// TODO(kbinias-intel): support more formats
auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
// create memory primitives
auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src_data);
auto dst_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)dst_data);
auto forward_desc = mkldnn::eltwise_forward::desc(
mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
// save prim desc into global device context to be referred in backward path
const std::string key = ctx.op().Output("Out");
const std::string key_eltwise_pd = key + "@eltwise_pd";
auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
forward_desc, mkldnn_engine);
dev_ctx.SetBlob(key_eltwise_pd, forward_pd);
auto eltwise = mkldnn::eltwise_forward(*forward_pd, src_memory, dst_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline = {eltwise};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
template <typename T, typename ExecContext>
void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
const T alpha = 0, const T beta = 0) {
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine();
// get buffers
const auto *x = ctx.template Input<Tensor>("X");
const auto *src = x->template data<T>();
auto *dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
const auto *diff_dst = dout->template data<T>();
auto *dx =
ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
const T *diff_src = dx->template mutable_data<T>(ctx.GetPlace());
// get memory dim
std::vector<int> src_tz = framework::vectorize2int(x->dims());
// create memory description
auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
// create memory primitives
auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src);
auto diff_src_memory =
mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_src);
auto diff_dst_memory =
mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_dst);
auto backward_desc =
mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta);
// retrieve eltwise primitive desc from device context
const std::string key = ctx.op().Input("Out");
const std::string key_eltwise_pd = key + "@eltwise_pd";
const std::shared_ptr<void> forward_pd = dev_ctx.GetBlob(key_eltwise_pd);
PADDLE_ENFORCE(forward_pd != nullptr,
"Fail to find eltwise_pd in device context");
auto *p_forward_pd =
static_cast<mkldnn::eltwise_forward::primitive_desc *>(forward_pd.get());
auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
backward_desc, mkldnn_engine, *p_forward_pd);
auto eltwise_bwd = mkldnn::eltwise_backward(eltwise_bwd_prim_desc, src_memory,
diff_dst_memory, diff_src_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline = {eltwise_bwd};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
} // anonymous namespace
template <typename T, mkldnn::algorithm algorithm>
struct MKLDNNActivationFunc : public BaseActivationFunctor<T> {
template <typename ExecContext>
void operator()(const ExecContext &ctx) const {
eltwise_forward<T>(ctx, algorithm);
}
};
template <typename T, mkldnn::algorithm algorithm>
struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
template <typename ExecContext>
void operator()(const ExecContext &ctx) const {
eltwise_grad<T>(ctx, algorithm);
}
};
template <typename T>
using ReluMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_relu>;
template <typename T>
using TanhMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_tanh>;
template <typename T>
using SqrtMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_sqrt>;
template <typename T>
using AbsMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_abs>;
template <typename T>
using ReluMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_relu>;
template <typename T>
using TanhMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_tanh>;
template <typename T>
using SqrtMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_sqrt>;
template <typename T>
using AbsMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_abs>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
#define REGISTER_ACTIVATION_MKLDNN_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace, \
ops::MKLDNNActivationKernel<ops::functor<float>>); \
REGISTER_OP_KERNEL( \
act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace, \
ops::MKLDNNActivationGradKernel<ops::grad_functor<float>>);
#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \
__macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor); \
__macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor); \
__macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor); \
__macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor);
FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL);
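For reference, the FOR_EACH_MKLDNN_KERNEL_FUNCTOR line expands the registration macro once per activation; for relu it is exactly equivalent to writing:

// Expansion of REGISTER_ACTIVATION_MKLDNN_KERNEL(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor).
REGISTER_OP_KERNEL(relu, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::MKLDNNActivationKernel<ops::ReluMkldnnFunctor<float>>);
REGISTER_OP_KERNEL(relu_grad, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::MKLDNNActivationGradKernel<ops::ReluMkldnnGradFunctor<float>>);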
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/mkldnn_activation_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -87,6 +88,9 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -87,6 +88,9 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Relu operator"); AddInput("X", "Input of Relu operator");
AddOutput("Out", "Output of Relu operator"); AddOutput("Out", "Output of Relu operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Relu Activation Operator. Relu Activation Operator.
...@@ -140,6 +144,9 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -140,6 +144,9 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Tanh operator"); AddInput("X", "Input of Tanh operator");
AddOutput("Out", "Output of Tanh operator"); AddOutput("Out", "Output of Tanh operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Tanh Activation Operator. Tanh Activation Operator.
...@@ -193,6 +200,9 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -193,6 +200,9 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Sqrt operator"); AddInput("X", "Input of Sqrt operator");
AddOutput("Out", "Output of Sqrt operator"); AddOutput("Out", "Output of Sqrt operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Sqrt Activation Operator. Sqrt Activation Operator.
...@@ -208,6 +218,9 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -208,6 +218,9 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Abs operator"); AddInput("X", "Input of Abs operator");
AddOutput("Out", "Output of Abs operator"); AddOutput("Out", "Output of Abs operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Abs Activation Operator. Abs Activation Operator.
...@@ -524,11 +537,11 @@ REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker, ...@@ -524,11 +537,11 @@ REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker,
REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad, REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad, REGISTER_OP(relu, ops::ActivationWithMKLDNNOp, ops::ReluOpMaker, relu_grad,
ops::ActivationOpGrad); ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad, REGISTER_OP(tanh, ops::ActivationWithMKLDNNOp, ops::TanhOpMaker, tanh_grad,
ops::ActivationOpGrad); ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
tanh_shrink_grad, ops::ActivationOpGrad); tanh_shrink_grad, ops::ActivationOpGrad);
...@@ -536,11 +549,11 @@ REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, ...@@ -536,11 +549,11 @@ REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker, REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker,
softshrink_grad, ops::ActivationOpGrad); softshrink_grad, ops::ActivationOpGrad);
REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad, REGISTER_OP(sqrt, ops::ActivationWithMKLDNNOp, ops::SqrtOpMaker, sqrt_grad,
ops::ActivationOpGrad); ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad, REGISTER_OP(abs, ops::ActivationWithMKLDNNOp, ops::AbsOpMaker, abs_grad,
ops::ActivationOpGrad); ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad, REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
......
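The four re-registered ops now go through ActivationWithMKLDNNOp / ActivationWithMKLDNNOpGrad, whose definition is not shown in this hunk. By analogy with GetExpectedLRNKernel added elsewhere in this commit, a plausible sketch of the dispatch such an op performs follows; treat every detail here as an assumption rather than the committed code:

// Hypothetical sketch of ActivationWithMKLDNNOp's kernel selection.
framework::OpKernelType GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const override {
  framework::LibraryType library{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_MKLDNN
  if (platform::CanMKLDNNBeUsed(ctx)) {  // consults the use_mkldnn attribute
    library = framework::LibraryType::kMKLDNN;
  }
#endif
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
      framework::DataLayout::kAnyLayout, library);
}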
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -17,6 +17,10 @@ limitations under the License. */ ...@@ -17,6 +17,10 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -23,21 +23,10 @@ limitations under the License. */ ...@@ -23,21 +23,10 @@ limitations under the License. */
static constexpr char Channel[] = "Channel"; static constexpr char Channel[] = "Channel";
static constexpr char X[] = "X"; static constexpr char X[] = "X";
static constexpr char Status[] = "Status";
static constexpr char copy[] = "copy";
namespace paddle { namespace paddle {
namespace operators { namespace operators {
void SetSendStatus(const platform::Place &dev_place,
framework::Variable &status_var, bool status) {
auto cpu = platform::CPUPlace();
auto status_tensor =
status_var.GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
cpu);
status_tensor[0] = status;
}
class ChannelSendOp : public framework::OperatorBase { class ChannelSendOp : public framework::OperatorBase {
public: public:
ChannelSendOp(const std::string &type, ChannelSendOp(const std::string &type,
...@@ -51,9 +40,6 @@ class ChannelSendOp : public framework::OperatorBase { ...@@ -51,9 +40,6 @@ class ChannelSendOp : public framework::OperatorBase {
"Input(Channel) of ChannelSendOp should not be null."); "Input(Channel) of ChannelSendOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput(X), PADDLE_ENFORCE(ctx->HasInput(X),
"Input(X) of ChannelSendOp should not be null."); "Input(X) of ChannelSendOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(Status),
"Output(Status) of ChannelSendOp should not be null.");
ctx->SetOutputDim("Status", {1});
} }
private: private:
...@@ -65,10 +51,7 @@ class ChannelSendOp : public framework::OperatorBase { ...@@ -65,10 +51,7 @@ class ChannelSendOp : public framework::OperatorBase {
auto input_var = scope.FindVar(Input(X)); auto input_var = scope.FindVar(Input(X));
// Send the input data through the channel. // Send the input data through the channel.
bool ok = concurrency::ChannelSend(ch, input_var); concurrency::ChannelSend(ch, input_var);
// Set the status output of the `ChannelSend` call.
SetSendStatus(dev_place, *scope.FindVar(Output(Status)), ok);
} }
}; };
...@@ -82,12 +65,6 @@ class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -82,12 +65,6 @@ class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker {
.AsDuplicable(); .AsDuplicable();
AddInput(X, "(Variable) The value which gets sent by the channel.") AddInput(X, "(Variable) The value which gets sent by the channel.")
.AsDuplicable(); .AsDuplicable();
AddOutput(Status,
"(Tensor) An LoD Tensor that returns a boolean status of the"
"result of the send operation.")
.AsDuplicable();
AddAttr<bool>(copy, "(bool, default false) Should copy before send")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
)DOC"); )DOC");
} }
......
...@@ -17,20 +17,20 @@ limitations under the License. */ ...@@ -17,20 +17,20 @@ limitations under the License. */
namespace poc = paddle::operators::concurrency; namespace poc = paddle::operators::concurrency;
bool poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) { void poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
auto type = framework::ToVarType(var->Type()); auto type = framework::ToVarType(var->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR) if (type == framework::proto::VarType_Type_LOD_TENSOR)
return ch->Send(var->GetMutable<framework::LoDTensor>()); ch->Send(var->GetMutable<framework::LoDTensor>());
else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
return ch->Send(var->GetMutable<framework::LoDRankTable>()); ch->Send(var->GetMutable<framework::LoDRankTable>());
else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
return ch->Send(var->GetMutable<framework::LoDTensorArray>()); ch->Send(var->GetMutable<framework::LoDTensorArray>());
else if (type == framework::proto::VarType_Type_SELECTED_ROWS) else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
return ch->Send(var->GetMutable<framework::SelectedRows>()); ch->Send(var->GetMutable<framework::SelectedRows>());
else if (type == framework::proto::VarType_Type_READER) else if (type == framework::proto::VarType_Type_READER)
return ch->Send(var->GetMutable<framework::ReaderHolder>()); ch->Send(var->GetMutable<framework::ReaderHolder>());
else if (type == framework::proto::VarType_Type_CHANNEL) else if (type == framework::proto::VarType_Type_CHANNEL)
return ch->Send(var->GetMutable<framework::ChannelHolder>()); ch->Send(var->GetMutable<framework::ChannelHolder>());
else else
PADDLE_THROW("ChannelSend:Unsupported type"); PADDLE_THROW("ChannelSend:Unsupported type");
} }
......
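With the boolean status gone, callers stop threading a result through the scope; a failed send now surfaces through PADDLE_THROW instead. A minimal call-site sketch, with the variable name illustrative:

// New-style call site: no Status tensor to allocate, no bool to propagate.
framework::Variable* input_var = scope.FindVar("x");  // "x" is illustrative
concurrency::ChannelSend(ch, input_var);  // throws on unsupported variable types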
...@@ -21,7 +21,7 @@ namespace paddle { ...@@ -21,7 +21,7 @@ namespace paddle {
namespace operators { namespace operators {
namespace concurrency { namespace concurrency {
bool ChannelSend(framework::ChannelHolder *ch, framework::Variable *var); void ChannelSend(framework::ChannelHolder *ch, framework::Variable *var);
bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var); bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var);
void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer, void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer,
......
...@@ -146,8 +146,9 @@ class GrpcByteBufferSource ...@@ -146,8 +146,9 @@ class GrpcByteBufferSource
class GrpcByteBufferSourceWrapper : public Source { class GrpcByteBufferSourceWrapper : public Source {
public: public:
GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) : source_(source) {} explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source)
virtual ::google::protobuf::io::ZeroCopyInputStream* contents() override { : source_(source) {}
::google::protobuf::io::ZeroCopyInputStream* contents() override {
return source_; return source_;
} }
......
...@@ -49,9 +49,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, ...@@ -49,9 +49,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
s->Prepare(var_h, time_out); s->Prepare(var_h, time_out);
s->response_call_back_ = NULL; s->response_call_back_ = NULL;
auto call = std::move(s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
&cq_));
call->StartCall(); call->StartCall();
call->Finish(&s->reply_, &s->status_, (void*)s); call->Finish(&s->reply_, &s->status_, (void*)s);
}); });
...@@ -107,8 +106,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, ...@@ -107,8 +106,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
::grpc::ByteBuffer buf; ::grpc::ByteBuffer buf;
RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf); RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
auto call = std::move(s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_)); s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
call->StartCall(); call->StartCall();
call->Finish(&s->reply_, &s->status_, (void*)s); call->Finish(&s->reply_, &s->status_, (void*)s);
}); });
......
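Dropping std::move around PrepareUnaryCall is more than a style fix: wrapping a returned temporary in std::move is redundant and can actively block copy elision, which is why compilers flag it under -Wpessimizing-move. A self-contained illustration of the same pattern:

#include <memory>
#include <utility>

std::unique_ptr<int> make_value() { return std::make_unique<int>(42); }

void demo() {
  auto a = make_value();             // fine: the returned temporary is already an rvalue
  auto b = std::move(make_value());  // pessimizing: redundant move may suppress elision
  (void)a;
  (void)b;
}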
...@@ -21,15 +21,11 @@ limitations under the License. */ ...@@ -21,15 +21,11 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/detail/grpc_service.h"
#include "paddle/fluid/operators/detail/simple_block_queue.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/detail/send_recv.pb.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/detail/grpc_service.h" #include "paddle/fluid/operators/detail/simple_block_queue.h"
//#include <grpc/support/log.h>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -81,7 +81,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { ...@@ -81,7 +81,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
// operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
framework::Scope scope; framework::Scope scope;
scope.Var("myvar"); scope.Var("myvar");
operators::detail::TensorResponse resp(&scope, &ctx); operators::detail::VariableResponse resp(&scope, &ctx);
EXPECT_EQ(resp.Parse(msg), 0); EXPECT_EQ(resp.Parse(msg), 0);
framework::Variable* var2 = resp.GetVar(); framework::Variable* var2 = resp.GetVar();
...@@ -166,7 +166,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { ...@@ -166,7 +166,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
// deserialize zero-copy // deserialize zero-copy
framework::Scope scope; framework::Scope scope;
scope.Var("myvar"); scope.Var("myvar");
operators::detail::TensorResponse resp(&scope, &ctx); operators::detail::VariableResponse resp(&scope, &ctx);
if (from_type == 0) { if (from_type == 0) {
EXPECT_EQ(resp.Parse(msg), 0); EXPECT_EQ(resp.Parse(msg), 0);
} else { } else {
...@@ -194,24 +194,23 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { ...@@ -194,24 +194,23 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
} }
TEST(LodTensor, GPU) { TEST(LodTensor, Run) {
platform::CUDAPlace place;
RunTestLodTensor(place);
RunTestLodTensor(place, 1);
}
TEST(LodTensor, CPU) {
platform::CPUPlace place; platform::CPUPlace place;
RunTestLodTensor(place); RunTestLodTensor(place);
RunTestLodTensor(place, 1); RunTestLodTensor(place, 1);
#ifdef PADDLE_WITH_CUDA
platform::CUDAPlace gpu(0);
RunTestLodTensor(gpu);
RunTestLodTensor(gpu, 1);
#endif
} }
TEST(SelectedRows, CPU) { TEST(SelectedRows, Run) {
platform::CPUPlace place; platform::CPUPlace place;
RunSerdeTestSelectedRows(place); RunSerdeTestSelectedRows(place);
}
TEST(SelectedRows, GPU) { #ifdef PADDLE_WITH_CUDA
platform::CUDAPlace place; platform::CUDAPlace gpu;
RunSerdeTestSelectedRows(place); RunSerdeTestSelectedRows(gpu);
#endif
} }
...@@ -36,9 +36,9 @@ class VariableResponse { ...@@ -36,9 +36,9 @@ class VariableResponse {
public: public:
VariableResponse(const framework::Scope* scope, VariableResponse(const framework::Scope* scope,
const platform::DeviceContext* dev_ctx) const platform::DeviceContext* dev_ctx)
: scope_(scope), dev_ctx_(dev_ctx){}; : scope_(scope), dev_ctx_(dev_ctx) {}
virtual ~VariableResponse(){}; virtual ~VariableResponse() {}
// return: // return:
// 0:ok. // 0:ok.
......
...@@ -33,6 +33,7 @@ __global__ void RandomGenerator(const size_t n, const int seed, ...@@ -33,6 +33,7 @@ __global__ void RandomGenerator(const size_t n, const int seed,
int idx = blockDim.x * blockIdx.x + threadIdx.x; int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < n; idx += blockDim.x * gridDim.x) { for (; idx < n; idx += blockDim.x * gridDim.x) {
rng.discard(idx);
if (dist(rng) < dropout_prob) { if (dist(rng) < dropout_prob) {
mask_data[idx] = static_cast<T>(0); mask_data[idx] = static_cast<T>(0);
} else { } else {
...@@ -54,9 +55,6 @@ class GPUDropoutKernel : public framework::OpKernel<T> { ...@@ -54,9 +55,6 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
y->mutable_data<T>(context.GetPlace()); y->mutable_data<T>(context.GetPlace());
float dropout_prob = context.Attr<float>("dropout_prob"); float dropout_prob = context.Attr<float>("dropout_prob");
auto X = EigenMatrix<T>::Reshape(*x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
auto& place = *context.template device_context<Place>().eigen_device(); auto& place = *context.template device_context<Place>().eigen_device();
if (!context.Attr<bool>("is_test")) { if (!context.Attr<bool>("is_test")) {
auto* mask = context.Output<Tensor>("Mask"); auto* mask = context.Output<Tensor>("Mask");
...@@ -75,6 +73,8 @@ class GPUDropoutKernel : public framework::OpKernel<T> { ...@@ -75,6 +73,8 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
T><<<grid, threads, 0, context.cuda_device_context().stream()>>>( T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
size, seed, dropout_prob, x_data, mask_data, y_data); size, seed, dropout_prob, x_data, mask_data, y_data);
} else { } else {
auto X = EigenMatrix<T>::Reshape(*x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
Y.device(place) = X * static_cast<T>(1.0f - dropout_prob); Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
} }
} }
......
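The added rng.discard(idx) is the substantive fix here: every thread seeds an identical engine, so without advancing the stream, threads starting from the same state would draw correlated values and the dropout mask would repeat in patterns. Skipping ahead by the element index gives each position its own draw. A condensed sketch of the kernel's pattern; the Thrust engine choice is an assumption, since the hunk only shows the rng/dist calls:

// Sketch of the per-element RNG pattern (engine and types assumed).
#include <thrust/random.h>

__global__ void RandomGeneratorSketch(const size_t n, const int seed,
                                      const float dropout_prob, const float* src,
                                      float* mask, float* dst) {
  thrust::minstd_rand rng;
  rng.seed(seed);
  thrust::uniform_real_distribution<float> dist(0, 1);
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  for (; idx < n; idx += blockDim.x * gridDim.x) {
    rng.discard(idx);  // advance the shared stream so this element gets its own draw
    mask[idx] = dist(rng) < dropout_prob ? 0.0f : 1.0f;
    dst[idx] = src[idx] * mask[idx];
  }
}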
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(dropout);
void Compare(f::Scope& scope, p::DeviceContext& ctx) {
// init
auto var = scope.Var("X");
auto tensor = var->GetMutable<f::LoDTensor>();
tensor->Resize({10, 10});
std::vector<float> init;
for (int64_t i = 0; i < 10 * 10; ++i) {
init.push_back(1.0);
}
TensorFromVector(init, ctx, tensor);
auto place = ctx.GetPlace();
auto out_var = scope.Var("Out");
auto out_tensor = out_var->GetMutable<f::LoDTensor>();
out_tensor->Resize({10, 10});
out_tensor->mutable_data<float>(place); // allocate
auto mask_var = scope.Var("Mask");
auto mask_tensor = mask_var->GetMutable<f::LoDTensor>();
mask_tensor->Resize({10, 10});
mask_tensor->mutable_data<float>(place); // allocate
// run
f::AttributeMap attrs;
float dropout_prob = 0.5;
attrs.insert({"fix_seed", 1});
attrs.insert({"seed", 3});
attrs.insert({"dropout_prob", dropout_prob});
auto dropout_op = f::OpRegistry::CreateOp(
"dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs);
dropout_op->Run(scope, place);
std::vector<float> out_vec;
TensorToVector(*out_tensor, ctx, &out_vec);
std::vector<float> std_out = {
0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1};
EXPECT_EQ(out_vec.size(), std_out.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], std_out[i]);
}
}
TEST(Dropout, CPUDense) {
f::Scope scope;
p::CPUPlace place;
p::CPUDeviceContext ctx(place);
Compare(scope, ctx);
}
TEST(Dropout, GPUDense) {
f::Scope scope;
p::CUDAPlace place;
p::CUDADeviceContext ctx(place);
Compare(scope, ctx);
}
...@@ -22,6 +22,103 @@ limitations under the License. */ ...@@ -22,6 +22,103 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
// Wrap RowwiseMean and ColwiseMean.
// Reuse the CPU code and replace the GPU code with cublas_gemv, which is
// significantly faster. Unlike RowwiseMean and ColwiseMean, this
// implementation only considers the 2D case.
template <typename DeviceContext, typename T>
struct RowwiseMean2D {
RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx);
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor* vec);
};
#ifdef PADDLE_WITH_CUDA
template <typename T>
class RowwiseMean2D<platform::CUDADeviceContext, T> {
public:
RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx)
: left_(left), right_(right) {
framework::DDim ones_dim({right_});
divisor_.mutable_data<T>(ones_dim, dev_ctx.GetPlace());
math::set_constant(dev_ctx, &divisor_, 1.0 / right);
}
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
math::gemv<platform::CUDADeviceContext, T>(
context, false, left_, right_, 1., input.data<T>(), divisor_.data<T>(),
0., out->data<T>());
}
private:
int left_;
int right_;
framework::Tensor divisor_;
};
#endif
template <typename T>
class RowwiseMean2D<platform::CPUDeviceContext, T> {
public:
RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
row_mean_(context, input, out);
}
private:
math::RowwiseMean<platform::CPUDeviceContext, T> row_mean_;
};
template <typename DeviceContext, typename T>
struct ColwiseSum2D {
ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx);
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor* vec);
};
#ifdef PADDLE_WITH_CUDA
template <typename T>
class ColwiseSum2D<platform::CUDADeviceContext, T> {
public:
ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx)
: left_(left), right_(right) {
framework::DDim ones_dim({left_});
divisor_.mutable_data<T>(ones_dim, dev_ctx.GetPlace());
math::set_constant(dev_ctx, &divisor_, 1.0);
}
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
math::gemv<platform::CUDADeviceContext, T>(
context, true, left_, right_, 1., input.data<T>(), divisor_.data<T>(),
0., out->data<T>());
}
private:
int left_;
int right_;
framework::Tensor divisor_;
};
#endif
template <typename T>
class ColwiseSum2D<platform::CPUDeviceContext, T> {
public:
ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
col_wise_(context, input, out);
}
private:
math::ColwiseSum<platform::CPUDeviceContext, T> col_wise_;
};
template <typename T> template <typename T>
struct SubAndSquareFunctor { struct SubAndSquareFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); }
...@@ -67,15 +164,15 @@ using DataLayout = framework::DataLayout; ...@@ -67,15 +164,15 @@ using DataLayout = framework::DataLayout;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class LayerNormKernel : public framework::OpKernel<T> { class LayerNormKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon"); const float epsilon = ctx.Attr<float>("epsilon");
auto *scale = ctx.Input<Tensor>("Scale"); auto* scale = ctx.Input<Tensor>("Scale");
auto *bias = ctx.Input<Tensor>("Bias"); auto* bias = ctx.Input<Tensor>("Bias");
auto x = *ctx.Input<Tensor>("X"); auto x = *ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Y"); auto* y = ctx.Output<Tensor>("Y");
auto *mean = ctx.Output<Tensor>("Mean"); auto* mean = ctx.Output<Tensor>("Mean");
auto *var = ctx.Output<Tensor>("Variance"); auto* var = ctx.Output<Tensor>("Variance");
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis"); const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
const auto x_dims = x.dims(); const auto x_dims = x.dims();
...@@ -94,8 +191,8 @@ class LayerNormKernel : public framework::OpKernel<T> { ...@@ -94,8 +191,8 @@ class LayerNormKernel : public framework::OpKernel<T> {
out.ShareDataWith(*y); out.ShareDataWith(*y);
out.Resize(matrix_shape); out.Resize(matrix_shape);
auto &dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::RowwiseMean<DeviceContext, T> row_mean; RowwiseMean2D<DeviceContext, T> row_mean(left, right, ctx.device_context());
// get mean // get mean
row_mean(dev_ctx, x, mean); row_mean(dev_ctx, x, mean);
...@@ -126,31 +223,32 @@ class LayerNormKernel : public framework::OpKernel<T> { ...@@ -126,31 +223,32 @@ class LayerNormKernel : public framework::OpKernel<T> {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class LayerNormGradKernel : public framework::OpKernel<T> { class LayerNormGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon"); const float epsilon = ctx.Attr<float>("epsilon");
auto x = *ctx.Input<Tensor>("X"); auto x = *ctx.Input<Tensor>("X");
auto *y = ctx.Input<Tensor>("Y"); auto* y = ctx.Input<Tensor>("Y");
auto *mean = ctx.Input<Tensor>("Mean"); auto* mean = ctx.Input<Tensor>("Mean");
auto *var = ctx.Input<Tensor>("Variance"); auto* var = ctx.Input<Tensor>("Variance");
auto *scale = ctx.Input<Tensor>("Scale"); auto* scale = ctx.Input<Tensor>("Scale");
auto *bias = ctx.Input<Tensor>("Bias"); auto* bias = ctx.Input<Tensor>("Bias");
auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y")); auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis"); const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
// init output // init output
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X")); auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale")); auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias")); auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
const auto &x_dims = x.dims(); const auto& x_dims = x.dims();
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]); int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]); int right = static_cast<int>(matrix_dim[1]);
framework::DDim matrix_shape({left, right}); framework::DDim matrix_shape({left, right});
d_y.Resize(matrix_shape); d_y.Resize(matrix_shape);
auto &dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::ColwiseSum<DeviceContext, T> colwise_sum; ColwiseSum2D<DeviceContext, T> colwise_sum(left, right,
ctx.device_context());
Tensor temp; Tensor temp;
Tensor temp_norm; Tensor temp_norm;
...@@ -190,7 +288,8 @@ class LayerNormGradKernel : public framework::OpKernel<T> { ...@@ -190,7 +288,8 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
Tensor temp_vec; Tensor temp_vec;
temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace()); temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace());
math::RowwiseMean<DeviceContext, T> row_mean; RowwiseMean2D<DeviceContext, T> row_mean(left, right,
ctx.device_context());
if (d_scale) { if (d_scale) {
// dy_dx // dy_dx
......
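The gemv trick behind the new RowwiseMean2D and ColwiseSum2D specializations is a standard identity: for X of shape L×R, multiplying by a constant vector turns a reduction into a matrix-vector product, which cuBLAS executes much faster than a generic elementwise reduction. In the notation of the code above, divisor_ plays the role of d (filled with 1/right) for the row mean and of e (filled with 1) for the column sum:

\mathrm{rowmean}(X)_i = \frac{1}{R}\sum_{j=1}^{R} X_{ij} = (X d)_i, \qquad d_j = \tfrac{1}{R}

\mathrm{colsum}(X)_j = \sum_{i=1}^{L} X_{ij} = (X^{\top} e)_j, \qquad e_i = 1

The first identity corresponds to the non-transposed gemv call, the second to the transposed one.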
...@@ -139,26 +139,25 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -139,26 +139,25 @@ class ListenAndServOp : public framework::OperatorBase {
// should be global ops. // should be global ops.
// NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads // and this will still work.
// and this will still work. // and this will still work.
std::vector<std::future<void>> fs; std::vector<std::future<void>> fs;
// block0 contains only listen_and_serv op, start run from block1. // block0 contains only listen_and_serv op, start run from block1.
for (int blkid = 1; blkid < num_blocks - 1; ++blkid) { for (int blkid = 1; blkid < num_blocks - 1; ++blkid) {
fs.push_back(framework::Async([&executor, &program, &recv_scope, fs.push_back(
blkid]() { framework::Async([&executor, &program, &recv_scope, blkid]() {
int run_block = blkid; // thread local int run_block = blkid; // thread local
try { try {
executor.Run(*program, &recv_scope, run_block, executor.Run(*program, &recv_scope, run_block, false, false);
false /*create_local_scope*/, false /*create_vars*/); } catch (std::exception &e) {
} catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what();
LOG(ERROR) << "run sub program error " << e.what(); }
} }));
}));
} }
for (int i = 0; i < num_blocks - 2; ++i) fs[i].wait(); for (int i = 0; i < num_blocks - 2; ++i) fs[i].wait();
// Run global block at final step, or block1 if there are only 2 blocks // Run global block at final step, or block1 if there are only 2 blocks
if (num_blocks >= 2) { if (num_blocks >= 2) {
try { try {
executor.Run(*program, &recv_scope, num_blocks - 1, executor.Run(*program, &recv_scope, num_blocks - 1, false, false);
false /*create_local_scope*/, false /*create_vars*/);
} catch (std::exception &e) { } catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what(); LOG(ERROR) << "run sub program error " << e.what();
} }
...@@ -177,6 +176,10 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -177,6 +176,10 @@ class ListenAndServOp : public framework::OperatorBase {
rpc_service_->WaitClientGet(fan_in); rpc_service_->WaitClientGet(fan_in);
sparse_vars.clear(); sparse_vars.clear();
} // while(true) } // while(true)
// for (int i = 0; i < num_blocks; ++i) {
// delete blk_ctx_list[i];
// }
} }
protected: protected:
......
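The reformatted loop is a plain fan-out/join: one future per intermediate block, all run concurrently, all joined before the final block executes. The same shape in standard C++, with std::async standing in for framework::Async and the executor call reduced to a placeholder:

#include <future>
#include <vector>

void RunBlocksConcurrently(int num_blocks) {
  std::vector<std::future<void>> fs;
  for (int blkid = 1; blkid < num_blocks - 1; ++blkid) {
    fs.push_back(std::async(std::launch::async, [blkid] {
      (void)blkid;  // placeholder for executor.Run(*program, &recv_scope, blkid, false, false);
    }));
  }
  for (auto& f : fs) f.wait();  // join all sub-blocks
  // placeholder for the final executor.Run(..., num_blocks - 1, ...) call
}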
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/lrn_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
namespace {
template <typename T, typename... Args>
std::shared_ptr<T> insert_to_context(const std::string& key,
const MKLDNNDeviceContext& dev_ctx,
Args&&... args) {
auto p = std::static_pointer_cast<T, void>(dev_ctx.GetBlob(key));
if (!p) {
p = std::make_shared<T>(args...);
dev_ctx.SetBlob(key, std::static_pointer_cast<void, T>(p));
}
return p;
}
template <typename... Args>
void run_primitive(Args&&... args) {
auto forward_op = mkldnn::lrn_forward{args...};
std::vector<mkldnn::primitive> pipeline = {forward_op};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
} // namespace
template <typename T>
class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(std::is_same<T, float>::value,
"MKLDNN LRN must use float data.");
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"MKLDNN LRN must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto x = ctx.Input<Tensor>("X");
auto out = ctx.Output<Tensor>("Out");
auto mid = ctx.Output<Tensor>("MidOut");
auto input_data = x->data<T>();
auto output_data = out->mutable_data<T>(ctx.GetPlace());
mid->mutable_data<T>(ctx.GetPlace());
const int n = ctx.Attr<int>("n");
const float alpha = ctx.Attr<float>("alpha");
const float beta = ctx.Attr<float>("beta");
const float k = ctx.Attr<float>("k");
const bool is_test = ctx.Attr<bool>("is_test");
auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
e_mid = e_mid.constant(k);
auto dims = paddle::framework::vectorize2int(x->dims());
auto src_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto dst_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward,
mkldnn::lrn_across_channels,
src_md,
n,
alpha,
beta,
k};
auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine},
static_cast<void*>(output_data)};
if (!is_test) {
const std::string key = ctx.op().Output("Out");
const std::string key_src_memory = key + "@lrn_src_memory";
const std::string key_pd = key + "@lrn_pd";
const std::string key_workspace_memory = key + "@lrn_workspace_memory";
auto forward_pd = insert_to_context<mkldnn::lrn_forward::primitive_desc>(
key_pd, dev_ctx, forward_desc, mkldnn_engine);
auto src_memory = insert_to_context<mkldnn::memory>(
key_src_memory, dev_ctx, src_memory_pd);
src_memory->set_data_handle(
static_cast<void*>(const_cast<T*>(input_data)));
auto workspace_memory = insert_to_context<mkldnn::memory>(
key_workspace_memory, dev_ctx,
forward_pd->workspace_primitive_desc());
run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory);
} else {
auto forward_pd =
mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine};
auto src_memory = mkldnn::memory{
src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))};
auto workspace_memory =
mkldnn::memory{forward_pd.workspace_primitive_desc()};
run_primitive(forward_pd, src_memory, workspace_memory, dst_memory);
}
}
};
template <typename T>
class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(std::is_same<T, float>::value,
"MKLDNN LRN must use float data.");
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"MKLDNN LRN must use CPUPlace.");
PADDLE_ENFORCE(
!ctx.Attr<bool>("is_test"),
"is_test attribute should be set to False in training phase.");
auto x = ctx.Input<Tensor>("X");
auto out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
const std::string key = ctx.op().Input("Out");
const std::string key_src_memory = key + "@lrn_src_memory";
const std::string key_pd = key + "@lrn_pd";
const std::string key_workspace_memory = key + "@lrn_workspace_memory";
const int n = ctx.Attr<int>("n");
const float alpha = ctx.Attr<float>("alpha");
const float beta = ctx.Attr<float>("beta");
const float k = ctx.Attr<float>("k");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto x_grad_data = x_grad->mutable_data<T>(ctx.GetPlace());
auto out_grad_data = out_grad->data<T>();
auto dims = paddle::framework::vectorize2int(x->dims());
auto src_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto diff_src_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto diff_dst_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto diff_dst_memory =
mkldnn::memory{{diff_dst_md, mkldnn_engine},
static_cast<void*>(const_cast<float*>(out_grad_data))};
auto diff_src_memory = mkldnn::memory{{diff_src_md, mkldnn_engine},
static_cast<void*>(x_grad_data)};
auto backward_desc = mkldnn::lrn_backward::desc{
mkldnn::lrn_across_channels, src_md, diff_src_md, n, alpha, beta, k};
auto forward_pd = dev_ctx.GetBlob(key_pd);
auto backward_pd = mkldnn::lrn_backward::primitive_desc{
backward_desc, mkldnn_engine,
*static_cast<mkldnn::lrn_forward::primitive_desc*>(forward_pd.get())};
std::shared_ptr<void> workspace_memory =
dev_ctx.GetBlob(key_workspace_memory);
auto src_memory = dev_ctx.GetBlob(key_src_memory);
auto backward_op = mkldnn::lrn_backward{
backward_pd, *static_cast<mkldnn::memory*>(src_memory.get()),
diff_dst_memory, *static_cast<mkldnn::memory*>(workspace_memory.get()),
diff_src_memory};
std::vector<mkldnn::primitive> pipeline = {backward_op};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(lrn, MKLDNN, paddle::platform::CPUPlace,
ops::LRNMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(lrn_grad, MKLDNN, paddle::platform::CPUPlace,
ops::LRNMKLDNNGradOpKernel<float>);
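The forward and backward kernels above communicate through the device context's blob cache: the forward pass stores its primitive descriptor, source memory, and workspace under keys derived from the name of the "Out" variable (ctx.op().Output("Out")), and the backward pass rebuilds the same keys from ctx.op().Input("Out"), which names the same graph variable. A minimal standalone analogue of that handshake, with the container a stand-in for MKLDNNDeviceContext storage:

#include <iostream>
#include <map>
#include <memory>
#include <string>

std::map<std::string, std::shared_ptr<void>> blobs;  // stand-in for dev_ctx storage

void SetBlob(const std::string& key, std::shared_ptr<void> v) { blobs[key] = v; }
std::shared_ptr<void> GetBlob(const std::string& key) {
  auto it = blobs.find(key);
  return it == blobs.end() ? nullptr : it->second;
}

int main() {
  const std::string key = "out_var@lrn_pd";  // forward: Output("Out") + suffix
  SetBlob(key, std::make_shared<int>(7));    // forward caches its primitive desc
  // backward: Input("Out") resolves to the same variable name, so keys match
  auto pd = std::static_pointer_cast<int>(GetBlob(key));
  std::cout << (pd ? *pd : -1) << "\n";      // prints 7
  return 0;
}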
...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/lrn_op.h" #include "paddle/fluid/operators/lrn_op.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -116,6 +119,26 @@ struct LRNGradFunctor<platform::CPUDeviceContext, T> { ...@@ -116,6 +119,26 @@ struct LRNGradFunctor<platform::CPUDeviceContext, T> {
template struct LRNGradFunctor<platform::CPUDeviceContext, float>; template struct LRNGradFunctor<platform::CPUDeviceContext, float>;
template struct LRNGradFunctor<platform::CPUDeviceContext, double>; template struct LRNGradFunctor<platform::CPUDeviceContext, double>;
namespace {
framework::OpKernelType GetExpectedLRNKernel(
const framework::ExecutionContext& ctx) {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
layout_, library_);
}
} // namespace
class LRNOp : public framework::OperatorWithKernel { class LRNOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -132,8 +155,13 @@ class LRNOp : public framework::OperatorWithKernel { ...@@ -132,8 +155,13 @@ class LRNOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'s rank of LRNOp should be 4."); PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'s rank of LRNOp should be 4.");
ctx->SetOutputDim("Out", x_dim); ctx->SetOutputDim("Out", x_dim);
ctx->SetOutputDim("MidOut", x_dim);
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
ctx->SetOutputDim("MidOut", x_dim);
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetExpectedLRNKernel(ctx);
} }
}; };
...@@ -176,6 +204,17 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -176,6 +204,17 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
"beta is the power number.") "beta is the power number.")
.SetDefault(0.75) .SetDefault(0.75)
.GreaterThan(0.0); .GreaterThan(0.0);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
"An optional string from: \"NHWC\", \"NCHW\". "
"Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ")
.SetDefault("AnyLayout");
AddAttr<bool>("is_test", "").SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Local Response Normalization Operator. Local Response Normalization Operator.
...@@ -223,8 +262,12 @@ class LRNOpGrad : public framework::OperatorWithKernel { ...@@ -223,8 +262,12 @@ class LRNOpGrad : public framework::OperatorWithKernel {
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
} }
};
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetExpectedLRNKernel(ctx);
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
......
...@@ -6,6 +6,7 @@ function(math_library TARGET) ...@@ -6,6 +6,7 @@ function(math_library TARGET)
# But it handles splitting GPU/CPU code and linking some common libraries. # But it handles splitting GPU/CPU code and linking some common libraries.
set(cc_srcs) set(cc_srcs)
set(cu_srcs) set(cu_srcs)
set(hip_srcs)
set(math_common_deps device_context framework_proto) set(math_common_deps device_context framework_proto)
set(multiValueArgs DEPS) set(multiValueArgs DEPS)
cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
...@@ -17,10 +18,15 @@ function(math_library TARGET) ...@@ -17,10 +18,15 @@ function(math_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu)
endif() endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_srcs ${TARGET}.hip.cu)
endif()
list(LENGTH cc_srcs cc_srcs_len) list(LENGTH cc_srcs cc_srcs_len)
if (WITH_GPU) if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
elseif (WITH_AMD_GPU)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
elseif(${cc_srcs_len} GREATER 0) elseif(${cc_srcs_len} GREATER 0)
cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
endif() endif()
......
...@@ -20,7 +20,7 @@ namespace math { ...@@ -20,7 +20,7 @@ namespace math {
/* /*
* All tensors' dimension should be the same and the values of * All tensors' rank should be the same and the size of
* each dimension are the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatFunctor<platform::CPUDeviceContext, T> { class ConcatFunctor<platform::CPUDeviceContext, T> {
...@@ -63,7 +63,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> { ...@@ -63,7 +63,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
/* /*
* All tensors' dimension should be the same and the values of * All tensors' rank should be the same and the size of
* each dimension are the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatGradFunctor<platform::CPUDeviceContext, T> { class ConcatGradFunctor<platform::CPUDeviceContext, T> {
......
...@@ -66,68 +66,66 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, ...@@ -66,68 +66,66 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
} }
template <typename T> template <typename T>
__global__ void KernelConcat(T** inputs, const int input_col, __global__ void KernelConcat(T** inputs_data, const int fixed_in_col,
const int output_rows, const int output_cols, const int out_rows, const int out_cols,
T* output) { T* output_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
double inv_input_col = 1.0 / input_col; for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) {
for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { int split = tid_x * 1.0 / fixed_in_col;
int split = tid_x * inv_input_col; int in_offset = tid_x - split * fixed_in_col;
int in_offset = tid_x - split * input_col; T* input_ptr = inputs_data[split];
T* input_ptr = inputs[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) { for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) {
output[tid_y * output_cols + tid_x] = output_data[tid_y * out_cols + tid_x] =
input_ptr[tid_y * input_col + in_offset]; input_ptr[tid_y * fixed_in_col + in_offset];
} }
} }
} }
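In the fixed-width variant above, every input contributes fixed_in_col columns, so the kernel recovers the source tensor and the local column from a flat output column with one integer division. A small Python sketch of the index arithmetic:

def concat_col_to_input(tid_x, fixed_in_col):
    split = tid_x // fixed_in_col             # which input tensor
    in_offset = tid_x - split * fixed_in_col  # column inside that input
    return split, in_offset

# Two inputs of 3 columns each: output column 4 lives in input 1, column 1.
assert concat_col_to_input(4, 3) == (1, 1)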
template <typename T> template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row, __global__ void KernelConcatGrad(const T* input_data, const int in_row,
const int input_col, const int* output_cols, const int in_col, const int* out_cols,
int col_size, T** outputs) { int out_cols_size, T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(output_cols, col_size, tid_x) - 1; int segment = upper_bound<int>(out_cols, out_cols_size, tid_x) - 1;
int curr_offset = output_cols[segment]; int curr_offset = out_cols[segment];
int curr_segment = segment; int curr_segment = segment;
for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset; T curr_col_offset;
while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) { while ((curr_col_offset = out_cols[curr_segment + 1]) <= tid_x) {
curr_offset = curr_col_offset; curr_offset = curr_col_offset;
++curr_segment; ++curr_segment;
} }
int local_col = tid_x - curr_offset; int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset; int segment_width = curr_col_offset - curr_offset;
T* output_ptr = outputs[curr_segment]; T* output_ptr = outputs_data[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * segment_width + local_col] = output_ptr[tid_y * segment_width + local_col] =
input[tid_y * input_col + tid_x]; input_data[tid_y * in_col + tid_x];
} }
} }
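When the outputs have unequal widths, the grad kernel instead searches the prefix-sum table out_cols with upper_bound to find the segment whose half-open column range contains tid_x, and carries curr_segment across the grid-stride loop so consecutive columns rarely re-search. The same lookup in Python, using bisect for the upper bound:

import bisect

def col_to_segment(tid_x, out_cols):
    # out_cols holds cumulative column offsets, e.g. [0, 3, 5, 9].
    segment = bisect.bisect_right(out_cols, tid_x) - 1
    local_col = tid_x - out_cols[segment]
    return segment, local_col

# Column 4 falls in the half-open range [3, 5): output 1, local column 1.
assert col_to_segment(4, [0, 3, 5, 9]) == (1, 1)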
template <typename T> template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row, __global__ void KernelConcatGrad(const T* input_data, const int in_row,
const int input_col, const int output_cols, const int in_col, const int fixed_out_col,
T** outputs) { T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
double inv_input_col = 1.0 / input_col; for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { int split = tid_x / fixed_out_col;
int split = tid_x * inv_input_col; int in_offset = tid_x - split * fixed_out_col;
int in_offset = tid_x - split * input_col; T* output_ptr = outputs_data[split];
T* output_ptr = outputs[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * output_cols + in_offset] = output_ptr[tid_y * fixed_out_col + in_offset] =
input[tid_y * input_col + tid_x]; input_data[tid_y * in_col + tid_x];
} }
} }
/* /*
* All tensors' dimension should be the same and the values of * All tensors' rank should be the same and the size of
* each dimension are the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatFunctor<platform::CUDADeviceContext, T> { class ConcatFunctor<platform::CUDADeviceContext, T> {
...@@ -136,41 +134,40 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -136,41 +134,40 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
const std::vector<framework::Tensor>& input, const int axis, const std::vector<framework::Tensor>& input, const int axis,
framework::Tensor* output) { framework::Tensor* output) {
// TODO(zcd): Add input data validity checking // TODO(zcd): Add input data validity checking
int num = input.size(); int in_num = input.size();
int rows = 1; int in_row = 1;
auto dim_0 = input[0].dims(); auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
rows *= dim_0[i]; in_row *= dim_0[i];
} }
int cols = input[0].numel() / rows; int in_col = input[0].numel() / in_row;
int out_rows = rows, out_cols = 0; int out_row = in_row, out_col = 0;
framework::Vector<int16_t> inputs_data(num * sizeof(T*) / 2); framework::Vector<int16_t> inputs_data(in_num * sizeof(T*) / 2);
framework::Vector<int> inputs_cols(num + 1); framework::Vector<int> inputs_col(in_num + 1);
inputs_cols[0] = 0;
T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data()); T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
inputs_col[0] = 0;
bool sameShape = true; bool sameShape = true;
for (int i = 0; i < num; ++i) { for (int i = 0; i < in_num; ++i) {
int t_cols = input[i].numel() / rows; int t_cols = input[i].numel() / in_row;
if (sameShape) { if (sameShape) {
if (t_cols != cols) sameShape = false; if (t_cols != in_col) sameShape = false;
} }
out_cols += t_cols; out_col += t_cols;
inputs_cols[i + 1] = out_cols; inputs_col[i + 1] = out_col;
inputs_ptr[i] = const_cast<T*>(input[i].data<T>()); inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
} }
T** ins_gpu = T** dev_ins_data =
reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace())); reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());
// computation // computation
// set the thread block and grid according to CurrentDeviceId // set the thread block and grid according to CurrentDeviceId
const int kThreadsPerBlock = 1024; const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock; int block_cols = kThreadsPerBlock;
if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. if (out_col < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((out_cols + 31) >> 5) << 5; block_cols = ((out_col + 31) >> 5) << 5;
} }
int block_rows = kThreadsPerBlock / block_cols; int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1); dim3 block_size = dim3(block_cols, block_rows, 1);
...@@ -179,25 +176,26 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -179,25 +176,26 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
int grid_cols = int grid_cols =
std::min((out_cols + block_cols - 1) / block_cols, max_blocks); std::min((out_col + block_cols - 1) / block_cols, max_blocks);
int grid_rows = int grid_rows =
std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1)); std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1); dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) { if (sameShape) {
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>( KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
ins_gpu, cols, out_rows, out_cols, output->data<T>()); dev_ins_data, in_col, out_row, out_col, output->data<T>());
} else { } else {
const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace());
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>( KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows, dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
out_cols, output->data<T>()); out_row, out_col, output->data<T>());
} }
} }
}; };
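The launch configuration in ConcatFunctor packs kThreadsPerBlock = 1024 threads per block, rounds the column dimension up to a multiple of 32 (one warp) when the output is narrow, and caps the grid by the device's thread budget. A sketch of the arithmetic, where max_threads stands for whatever context.GetMaxPhysicalThreadCount() reports:

def launch_config(out_col, out_row, max_threads, k_threads=1024):
    block_cols = k_threads
    if out_col < k_threads:                  # round up to a multiple of 32
        block_cols = ((out_col + 31) >> 5) << 5
    block_rows = k_threads // block_cols
    max_blocks = max(max_threads // k_threads, 1)
    grid_cols = min((out_col + block_cols - 1) // block_cols, max_blocks)
    grid_rows = min(max_blocks // grid_cols, max(out_row // block_rows, 1))
    return (block_cols, block_rows), (grid_cols, grid_rows)

# A 100-column, 64-row output on a device reporting 2048 * 68 threads:
assert launch_config(100, 64, 2048 * 68) == ((128, 8), (1, 8))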
/* /*
* All tensors' dimension should be the same and the values of * All tensors' rank should be the same and the size of
* each dimension are the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatGradFunctor<platform::CUDADeviceContext, T> { class ConcatGradFunctor<platform::CUDADeviceContext, T> {
...@@ -206,41 +204,40 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -206,41 +204,40 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& input, const int axis, const framework::Tensor& input, const int axis,
std::vector<framework::Tensor>& outputs) { std::vector<framework::Tensor>& outputs) {
// TODO(zcd): Add input data validity checking // TODO(zcd): Add input data validity checking
int num = outputs.size(); int o_num = outputs.size();
int input_row = 1; int out_row = 1;
auto dim_0 = outputs[0].dims(); auto dim_0 = outputs[0].dims();
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
input_row *= dim_0[i]; out_row *= dim_0[i];
} }
int output_col_0 = outputs[0].numel() / input_row; int out_col = outputs[0].numel() / out_row;
int input_col = 0; int in_col = 0, in_row = out_row;
bool sameShape = true; bool sameShape = true;
framework::Vector<int16_t> outputs_data(num * sizeof(T*) / 2); framework::Vector<int16_t> outputs_data(o_num * sizeof(T*) / 2);
framework::Vector<int> outputs_cols(num + 1); framework::Vector<int> outputs_cols(o_num + 1);
outputs_cols[0] = 0;
T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data()); T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());
for (int i = 0; i < num; ++i) { outputs_cols[0] = 0;
int t_col = outputs[i].numel() / input_row; for (int i = 0; i < o_num; ++i) {
int t_col = outputs[i].numel() / out_row;
if (sameShape) { if (sameShape) {
if (t_col != output_col_0) sameShape = false; if (t_col != out_col) sameShape = false;
} }
input_col += t_col; in_col += t_col;
outputs_cols[i + 1] = input_col; outputs_cols[i + 1] = in_col;
outputs_ptr[i] = outputs[i].data<T>(); outputs_ptr[i] = outputs[i].data<T>();
} }
T** outs_gpu = T** dev_out_gpu_data =
reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace())); reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());
// computation // computation
const int kThreadsPerBlock = 1024; const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock; int block_cols = kThreadsPerBlock;
if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32. if (in_col < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((input_col + 31) >> 5) << 5; block_cols = ((in_col + 31) >> 5) << 5;
} }
int block_rows = kThreadsPerBlock / block_cols; int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1); dim3 block_size = dim3(block_cols, block_rows, 1);
...@@ -249,18 +246,19 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -249,18 +246,19 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
int grid_cols = int grid_cols =
std::min((input_col + block_cols - 1) / block_cols, max_blocks); std::min((in_col + block_cols - 1) / block_cols, max_blocks);
int grid_rows = int grid_rows =
std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1)); std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1); dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) { if (sameShape) {
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>( KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), input_row, input_col, output_col_0, outs_gpu); input.data<T>(), in_row, in_col, out_col, dev_out_gpu_data);
} else { } else {
const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>( KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), input_row, input_col, outs_col_gpu, input.data<T>(), in_row, in_col, dev_outs_col_data,
static_cast<int>(outputs_cols.size()), outs_gpu); static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
} }
} }
}; };
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <hip/hip_runtime.h>
...@@ -19,13 +19,6 @@ limitations under the License. */ ...@@ -19,13 +19,6 @@ limitations under the License. */
#include <mkl_vml_functions.h> #include <mkl_vml_functions.h>
#endif #endif
#ifdef PADDLE_USE_ATLAS
extern "C" {
#include <cblas.h>
#include <clapack.h>
}
#endif
#ifdef PADDLE_USE_OPENBLAS #ifdef PADDLE_USE_OPENBLAS
#include <cblas.h> #include <cblas.h>
#include <lapacke.h> #include <lapacke.h>
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename Functor>
class MKLDNNActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE(context.Input<framework::Tensor>("X") != nullptr,
"Cannot get input tensor X, variable name = %s",
context.op().Input("X"));
PADDLE_ENFORCE(context.Output<framework::Tensor>("Out") != nullptr,
"Cannot find output tensor Out, variable name = %s",
context.op().Output("Out"));
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
template <typename Functor>
class MKLDNNActivationGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
namespace {
framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx,
const framework::OperatorWithKernel& oper) {
framework::LibraryType library{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
}
#endif
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.GetPlace(), layout, library);
}
} // anonymous namespace
class ActivationWithMKLDNNOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetKernelType(ctx, *this);
}
};
class ActivationWithMKLDNNOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetKernelType(ctx, *this);
}
};
} // namespace operators
} // namespace paddle
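Note how the kernels stay generic over activation types: each functor publishes (attribute name, float*) pairs through GetAttrs(), and Compute fills those slots from the op's attributes before invoking the functor. A toy Python sketch of that contract, with one-element lists standing in for the float pointers:

def bind_attrs(functor_attrs, op_attrs):
    # Mirrors the attribute loop in MKLDNNActivationKernel::Compute.
    for name, slot in functor_attrs:
        slot[0] = op_attrs[name]

alpha = [0.0]  # plays the role of the functor's float* member
bind_attrs([("alpha", alpha)], {"alpha": 0.5})
assert alpha == [0.5]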
...@@ -144,7 +144,12 @@ class ParallelDoOp : public framework::OperatorBase { ...@@ -144,7 +144,12 @@ class ParallelDoOp : public framework::OperatorBase {
PADDLE_ENFORCE(scope.FindVar(param)->IsType<LoDTensor>(), PADDLE_ENFORCE(scope.FindVar(param)->IsType<LoDTensor>(),
"Only support parameter type as LoDTensor"); "Only support parameter type as LoDTensor");
auto &src = scope.FindVar(param)->Get<LoDTensor>(); auto &src = scope.FindVar(param)->Get<LoDTensor>();
for (size_t i = 0; i < sub_scopes.size(); ++i) {
auto *sub_scope0 = sub_scopes[0];
auto *dst0 = sub_scope0->Var(param)->GetMutable<LoDTensor>();
dst0->ShareDataWith(src);
for (size_t i = 1; i < sub_scopes.size(); ++i) {
auto &place = places[i]; auto &place = places[i];
auto *sub_scope = sub_scopes[i]; auto *sub_scope = sub_scopes[i];
auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>(); auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
......
...@@ -166,7 +166,9 @@ void DoubleBufferReader::PrefetchThreadFunc() { ...@@ -166,7 +166,9 @@ void DoubleBufferReader::PrefetchThreadFunc() {
std::swap(gpu_batch, batch.payloads_); std::swap(gpu_batch, batch.payloads_);
} }
if (!buffer_->Send(&batch)) { try {
buffer_->Send(&batch);
} catch (const paddle::platform::EnforceNotMet& e) {
VLOG(5) << "WARNING: The double buffer channel has been closed. The " VLOG(5) << "WARNING: The double buffer channel has been closed. The "
"prefetch thread will terminate."; "prefetch thread will terminate.";
break; break;
......
...@@ -146,14 +146,19 @@ void MultipleReader::PrefetchThreadFunc(std::string file_name, ...@@ -146,14 +146,19 @@ void MultipleReader::PrefetchThreadFunc(std::string file_name,
while (reader->HasNext()) { while (reader->HasNext()) {
std::vector<framework::LoDTensor> ins; std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins); reader->ReadNext(&ins);
if (!buffer_->Send(&ins)) { try {
buffer_->Send(&ins);
} catch (const paddle::platform::EnforceNotMet& e) {
VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch " VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch "
"thread of file '" "thread of file '"
<< file_name << "' will terminate."; << file_name << "' will terminate.";
break; break;
} }
} }
if (!available_thread_idx_->Send(&thread_idx)) {
try {
available_thread_idx_->Send(&thread_idx);
} catch (const paddle::platform::EnforceNotMet& e) {
VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. " VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. "
"Fail to send thread_idx."; "Fail to send thread_idx.";
} }
......
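Both reader changes above replace the old boolean contract of Send() with an exception: once the channel is closed, Send raises EnforceNotMet and the prefetch loop exits through the handler. A toy Python analogue of the new control flow (this Channel class is illustrative only, not the fluid one):

class ClosedChannel(Exception):
    pass

class Channel(object):
    def __init__(self):
        self.closed = False
        self.items = []

    def send(self, item):
        if self.closed:          # plays the role of EnforceNotMet
            raise ClosedChannel()
        self.items.append(item)

def prefetch(channel, batches):
    for batch in batches:
        try:
            channel.send(batch)
        except ClosedChannel:
            break                # the prefetch thread terminates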
...@@ -27,6 +27,7 @@ namespace operators { ...@@ -27,6 +27,7 @@ namespace operators {
static constexpr char kX[] = "X"; static constexpr char kX[] = "X";
static constexpr char kCaseToExecute[] = "case_to_execute"; static constexpr char kCaseToExecute[] = "case_to_execute";
static constexpr char kOutputs[] = "Out";
static constexpr char kCases[] = "cases"; static constexpr char kCases[] = "cases";
static constexpr char kCasesBlock[] = "sub_block"; static constexpr char kCasesBlock[] = "sub_block";
...@@ -388,6 +389,10 @@ class SelectOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -388,6 +389,10 @@ class SelectOpMaker : public framework::OpProtoAndCheckerMaker {
"(Int) The variable the sets the index of the case to execute, " "(Int) The variable the sets the index of the case to execute, "
"after evaluating the channels being sent to and received from") "after evaluating the channels being sent to and received from")
.AsDuplicable(); .AsDuplicable();
AddOutput(kOutputs,
"A set of variables, which will be assigned with values "
"generated by the operators inside the cases of Select Op.")
.AsDuplicable();
AddAttr<std::vector<std::string>>(kCases, AddAttr<std::vector<std::string>>(kCases,
"(String vector) Serialized list of" "(String vector) Serialized list of"
"all cases in the select op. Each" "all cases in the select op. Each"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include <future>
#include "paddle/fluid/operators/detail/grpc_client.h"
namespace paddle {
namespace operators {
class SendBarrierOp : public framework::OperatorBase {
public:
SendBarrierOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
auto client_var_name = Output("RPCClient");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
"Can not find variable '%s' in the scope.",
client_var_name);
auto* client_var = scope.FindVar(client_var_name);
detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
// need to wait before sending send_barrier message
PADDLE_ENFORCE(rpc_client->Wait());
for (auto& ep : eps) {
VLOG(3) << "send barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep);
}
PADDLE_ENFORCE(rpc_client->Wait());
}
};
class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SendBarrierOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddOutput("RPCClient",
"(RPCClient) The RPC client object which is"
"initialized at most once.");
AddComment(R"DOC(
SendBarrier operator
This operator sends a barrier signal to the listen_and_serv op, so that
the parameter server knows all variables for this batch have been sent.
)DOC");
AddAttr<std::vector<std::string>>("endpoints",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to.")
.SetDefault({"127.0.0.1:6164"});
}
};
class SendBarrierOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto out_var_name = op_desc.Output("RPCClient").front();
auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
auto var_type = framework::proto::VarType::RAW;
out_var.SetType(var_type);
}
};
class SendBarrierOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send_barrier, ops::SendBarrierOp,
paddle::framework::EmptyGradOpMaker, ops::SendBarrierOpMaker,
ops::SendBarrierOpVarTypeInference,
ops::SendBarrierOpShapeInference);
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include <future> #include <future>
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -59,6 +60,9 @@ class SendOp : public framework::OperatorBase { ...@@ -59,6 +60,9 @@ class SendOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place); auto& ctx = *pool.Get(place);
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
auto client_var_name = Output("RPCClient"); auto client_var_name = Output("RPCClient");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name), PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
"Can not find variable '%s' in the scope.", "Can not find variable '%s' in the scope.",
...@@ -68,7 +72,7 @@ class SendOp : public framework::OperatorBase { ...@@ -68,7 +72,7 @@ class SendOp : public framework::OperatorBase {
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) { if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; VLOG(2) << "sending " << ins[i] << " to " << epmap[i];
rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
} else { } else {
VLOG(3) << "don't send no-initialied variable: " << ins[i]; VLOG(3) << "don't send no-initialied variable: " << ins[i];
...@@ -77,20 +81,20 @@ class SendOp : public framework::OperatorBase { ...@@ -77,20 +81,20 @@ class SendOp : public framework::OperatorBase {
PADDLE_ENFORCE(rpc_client->Wait()); PADDLE_ENFORCE(rpc_client->Wait());
for (auto& ep : endpoints) { for (auto& ep : endpoints) {
VLOG(3) << "batch barrier, ep: " << ep; VLOG(2) << "batch barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep); rpc_client->AsyncSendBatchBarrier(ep);
} }
PADDLE_ENFORCE(rpc_client->Wait()); PADDLE_ENFORCE(rpc_client->Wait());
if (outs.size() > 0) { if (outs.size() > 0) {
for (size_t i = 0; i < outs.size(); i++) { for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; VLOG(2) << "getting " << outs[i] << " from " << epmap[i];
rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]); rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
} }
PADDLE_ENFORCE(rpc_client->Wait()); PADDLE_ENFORCE(rpc_client->Wait());
// tell pservers that current trainer have called fetch // tell pservers that current trainer have called fetch
for (auto& ep : endpoints) { for (auto& ep : endpoints) {
VLOG(3) << "send fetch barrier, ep: " << ep; VLOG(2) << "send fetch barrier, ep: " << ep;
rpc_client->AsyncSendFetchBarrier(ep); rpc_client->AsyncSendFetchBarrier(ep);
} }
PADDLE_ENFORCE(rpc_client->Wait()); PADDLE_ENFORCE(rpc_client->Wait());
......
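Taken together, SendOp::RunImpl performs a fixed four-phase handshake per mini-batch: push gradients, signal a batch barrier, pull updated parameters, then signal a fetch barrier, waiting on the client between phases. The sketch below mirrors that order with a hypothetical Python client whose method names shadow detail::RPCClient (none of these Python names exist in fluid):

def send_round(rpc, ins, outs, epmap, endpoints):
    for var, ep in zip(ins, epmap):
        rpc.async_send_variable(ep, var)   # push gradients
    rpc.wait()
    for ep in endpoints:
        rpc.async_send_batch_barrier(ep)   # "all sends for this batch done"
    rpc.wait()
    for var, ep in zip(outs, epmap):
        rpc.async_get_variable(ep, var)    # pull updated parameters
    rpc.wait()
    for ep in endpoints:
        rpc.async_send_fetch_barrier(ep)   # "this trainer has fetched"
    rpc.wait()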
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include <future>
#include "paddle/fluid/operators/detail/grpc_client.h"
namespace paddle {
namespace operators {
static bool NeedSend(const framework::Scope& scope,
const std::string& varname) {
auto* var = scope.FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable '%s' on the send side.",
varname);
if (var->IsType<framework::LoDTensor>()) {
return var->Get<framework::LoDTensor>().IsInitialized();
} else if (var->IsType<framework::SelectedRows>()) {
return var->Get<framework::SelectedRows>().rows().size() > 0UL;
} else {
PADDLE_THROW(
"Variable type in send side should be in "
"[LodTensor, SelectedRows]");
}
return false;
}
class SendVarsOp : public framework::OperatorBase {
public:
SendVarsOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
auto ins = Inputs("X");
std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
int sync_send = Attr<int>("sync_send");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
auto client_var_name = Output("RPCClient");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
"Can not find variable '%s' in the scope.",
client_var_name);
auto* client_var = scope.FindVar(client_var_name);
detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
// TODO(Yancey1989): we need to use an IO threadpool which has
// a larger number of threads than the computing threadpool.
rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
} else {
VLOG(3) << "don't send no-initialied variable: " << ins[i];
}
}
if (sync_send) {
rpc_client->Wait();
}
}
};
class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SendVarsOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
.AsDuplicable();
AddOutput("RPCClient",
"(RPCClient) The RPC client object which will be"
"initialized at most once.");
AddComment(R"DOC(
Send operator
This operator will send variables to listen_and_serve op at the parameter server.
)DOC");
AddAttr<int>("ync_send",
"(int, default 0)"
"sync send or async send.")
.SetDefault(0);
AddAttr<std::vector<std::string>>("epmap",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input "
"variables for mapping")
.SetDefault({"127.0.0.1:6164"});
}
};
class SendVarsOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto out_var_name = op_desc.Output("RPCClient").front();
auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
auto var_type = framework::proto::VarType::RAW;
out_var.SetType(var_type);
}
};
class SendVarsOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send_vars, ops::SendVarsOp,
paddle::framework::EmptyGradOpMaker, ops::SendVarsOpMaker,
ops::SendVarsOpVarTypeInference,
ops::SendVarsOpShapeInference);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/split_ids_op.h"
namespace paddle {
namespace operators {
class SplitIdsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SplitIdsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
AddOutput("Out", "(LoDTensor) The outputs of the input Ids.")
.AsDuplicable();
AddComment(R"DOC(
Split a LoDTensor of Ids into multiple LoDTensors; the number of outputs equals the number of pservers.
Example:
Input:
X = [1,2,3,4,5,6]
Out(3 output):
out0 = [3, 6]
out1 = [1, 4]
out2 = [2, 5]
)DOC");
}
};
class SplitIdsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Ids"), "SplitIdsOp must have input Ids.");
PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must have output Out.");
auto ids_var_type = ctx->GetInputsVarType("Ids").front();
PADDLE_ENFORCE_EQ(ids_var_type, framework::proto::VarType::LOD_TENSOR);
auto ids_dims = ctx->GetInputDim("Ids");
PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[1], 1);
}
};
class SplitIdsOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
for (auto &out_var : op_desc.Output("Out")) {
block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
ops::SplitIdsOpInferVarType);
REGISTER_OP_CPU_KERNEL(
split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SplitIdsOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto place = ctx.GetPlace();
if (!platform::is_cpu_place(place)) {
PADDLE_THROW("SplitIds do not support GPU kernel");
}
const auto* ids_t = ctx.Input<framework::LoDTensor>("Ids");
auto& ids_dims = ids_t->dims();
auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
const T* ids = ids_t->data<T>();
const size_t shard_num = outs.size();
std::vector<std::vector<T>> out_ids;
out_ids.resize(outs.size());
// Split ids into shards: each id goes to shard id % shard_num.
for (size_t i = 0; i < ids_dims[0]; ++i) {
T id = ids[i];
size_t shard_id = static_cast<size_t>(id) % shard_num;
out_ids[shard_id].push_back(id);
}
// create tensor for each shard and send to parameter server
for (size_t i = 0; i < out_ids.size(); ++i) {
auto* shard_t = outs[i];
const auto& ids = out_ids[i];  // avoid copying the shard's ids
auto* shard_data = shard_t->mutable_data<T>(
    framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
for (size_t j = 0; j < ids.size(); ++j) {
  shard_data[j] = ids[j];
}
}
}
};
} // namespace operators
} // namespace paddle
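The sharding rule in the kernel is plain modulo hashing: each id lands in shard id % shard_num, which reproduces the Out example in the op's DOC. In Python:

def split_ids(ids, shard_num):
    shards = [[] for _ in range(shard_num)]
    for i in ids:
        shards[i % shard_num].append(i)
    return shards

assert split_ids([1, 2, 3, 4, 5, 6], 3) == [[3, 6], [1, 4], [2, 5]]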
...@@ -49,7 +49,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_ ...@@ -49,7 +49,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS}) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
......
if(WITH_PYTHON) if(WITH_PYTHON)
cc_library(paddle_pybind SHARED if(WITH_AMD_GPU)
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc hip_library(paddle_pybind SHARED
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
${GLOB_OP_LIB}) DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
if(NOT APPLE AND NOT ANDROID) ${GLOB_OP_LIB})
target_link_libraries(paddle_pybind rt) else()
endif(NOT APPLE AND NOT ANDROID) cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID)
target_link_libraries(paddle_pybind rt)
endif(NOT APPLE AND NOT ANDROID)
endif(WITH_AMD_GPU)
endif(WITH_PYTHON) endif(WITH_PYTHON)
...@@ -59,17 +59,10 @@ void* lapack_dso_handle = nullptr; ...@@ -59,17 +59,10 @@ void* lapack_dso_handle = nullptr;
} __name; // struct DynLoad__##__name } __name; // struct DynLoad__##__name
#endif #endif
#ifdef PADDLE_USE_ATLAS #define PADDLE_SGETRF LAPACKE_sgetrf
#define PADDLE_SGETRF clapack_sgetrf #define PADDLE_DGETRF LAPACKE_dgetrf
#define PADDLE_DGETRF clapack_dgetrf #define PADDLE_SGETRI LAPACKE_sgetri
#define PADDLE_SGETRI clapack_sgetri #define PADDLE_DGETRI LAPACKE_dgetri
#define PADDLE_DGETRI clapack_dgetri
#else
#define PADDLE_SGETRF LAPACKE_sgetrf
#define PADDLE_DGETRF LAPACKE_dgetrf
#define PADDLE_SGETRI LAPACKE_sgetri
#define PADDLE_DGETRI LAPACKE_dgetri
#endif
#define LAPACK_ROUTINE_EACH(__macro) \ #define LAPACK_ROUTINE_EACH(__macro) \
__macro(PADDLE_SGETRF) \ __macro(PADDLE_SGETRF) \
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include <mkl_vml_functions.h> #include <mkl_vml_functions.h>
#endif #endif
#if defined(PADDLE_USE_ATLAS) || defined(PADDLE_USE_VECLIB) #if defined(PADDLE_USE_VECLIB)
extern "C" { extern "C" {
#include <cblas.h> #include <cblas.h>
#include <clapack.h> #include <clapack.h>
......
...@@ -37,6 +37,7 @@ function cmake_gen() { ...@@ -37,6 +37,7 @@ function cmake_gen() {
-DWITH_DSO=ON -DWITH_DSO=ON
-DWITH_DOC=${WITH_DOC:-OFF} -DWITH_DOC=${WITH_DOC:-OFF}
-DWITH_GPU=${WITH_GPU:-OFF} -DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
-DWITH_MKL=${WITH_MKL:-ON} -DWITH_MKL=${WITH_MKL:-ON}
-DWITH_AVX=${WITH_AVX:-OFF} -DWITH_AVX=${WITH_AVX:-OFF}
...@@ -50,6 +51,7 @@ function cmake_gen() { ...@@ -50,6 +51,7 @@ function cmake_gen() {
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
-DWITH_TESTING=${WITH_TESTING:-ON} -DWITH_TESTING=${WITH_TESTING:-ON}
-DWITH_FAST_BUNDLE_TEST=ON -DWITH_FAST_BUNDLE_TEST=ON
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
======================================== ========================================
EOF EOF
...@@ -62,6 +64,7 @@ EOF ...@@ -62,6 +64,7 @@ EOF
-DWITH_DSO=ON \ -DWITH_DSO=ON \
-DWITH_DOC=${WITH_DOC:-OFF} \ -DWITH_DOC=${WITH_DOC:-OFF} \
-DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \ -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
-DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_MKL=${WITH_MKL:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \
...@@ -74,6 +77,7 @@ EOF ...@@ -74,6 +77,7 @@ EOF
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-DWITH_TESTING=${WITH_TESTING:-ON} \ -DWITH_TESTING=${WITH_TESTING:-ON} \
-DWITH_FAST_BUNDLE_TEST=ON \ -DWITH_FAST_BUNDLE_TEST=ON \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
} }
......
...@@ -12,7 +12,8 @@ ...@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from layers.control_flow import BlockGuard, Select from layers.control_flow import BlockGuard, equal
from .framework import Operator
from layer_helper import LayerHelper, unique_name from layer_helper import LayerHelper, unique_name
from layers import fill_constant from layers import fill_constant
import core import core
...@@ -75,6 +76,206 @@ class Go(BlockGuard): ...@@ -75,6 +76,206 @@ class Go(BlockGuard):
attrs={'sub_block': go_block}) attrs={'sub_block': go_block})
class SelectCase(object):
DEFAULT = 0
SEND = 1
RECEIVE = 2
def __init__(self,
select,
case_idx,
case_to_execute,
channel_action_fn=None,
channel=None,
value=None,
is_copy=False):
self.select = select
self.helper = LayerHelper('conditional_block')
self.main_program = self.helper.main_program
self.is_scalar_condition = True
self.case_to_execute = case_to_execute
self.idx = case_idx
# Since we aren't going to use the `channel_send` or `channel_recv`
# functions directly, we just need to capture the name.
self.action = (self.SEND
if channel_action_fn.__name__ == ('channel_send') else
self.RECEIVE) if channel_action_fn else self.DEFAULT
X = value
if self.action == self.SEND and is_copy:
# We create a copy of the data we want to send
copied_X = self.select.parent_block.create_var(
name=unique_name.generate(value.name + '_copy'),
type=value.type,
dtype=value.dtype,
shape=value.shape,
lod_level=value.lod_level,
capacity=value.capacity
if hasattr(value, 'capacity') else None, )
self.select.parent_block.append_op(
type="assign", inputs={"X": value}, outputs={"Out": copied_X})
X = copied_X
self.value = X
self.channel = channel
def __enter__(self):
self.block = self.main_program.create_block()
def construct_op(self):
main_program = self.helper.main_program
cases_block = main_program.current_block()
inner_outputs = set()
input_set = set()
params = set()
for op in self.block.ops:
# Iterate over all operators, get all the inputs
# and add as input to the SelectCase operator.
for iname in op.input_names:
for in_var_name in op.input(iname):
if in_var_name not in inner_outputs:
input_set.add(in_var_name)
for oname in op.output_names:
for out_var_name in op.output(oname):
inner_outputs.add(out_var_name)
param_list = [
cases_block.var(each_name) for each_name in params
if each_name not in input_set
]
# Iterate over all operators, get all the outputs
# add to the output list of SelectCase operator only if
# they exist in the parent block.
out_vars = []
for inner_out_name in inner_outputs:
if inner_out_name in cases_block.vars:
out_vars.append(cases_block.var(inner_out_name))
# First, create an op that will determine whether or not this is the
# conditional variable to execute.
should_execute_block = equal(
fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx),
self.case_to_execute)
step_scope = cases_block.create_var(
type=core.VarDesc.VarType.STEP_SCOPES)
cases_block.append_op(
type='conditional_block',
inputs={'X': [should_execute_block],
'Params': param_list},
outputs={'Out': out_vars,
'Scope': [step_scope]},
attrs={
'sub_block': self.block,
'is_scalar_condition': self.is_scalar_condition
})
return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name
if self.channel else '', self.value.name
if self.value else '')
def __exit__(self, exc_type, exc_val, exc_tb):
self.main_program.rollback()
if exc_type is not None:
return False # re-raise exception
return True
class Select(BlockGuard):
def __init__(self, name=None):
self.helper = LayerHelper('select', name=name)
self.parent_block = self.helper.main_program.current_block()
self.cases = []
super(Select, self).__init__(self.helper.main_program)
self.case_to_execute = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
def __enter__(self):
super(Select, self).__enter__()
return self
def case(self, channel_action_fn, channel, value, is_copy=False):
"""Create a new block for this condition.
"""
select_case = SelectCase(self,
len(self.cases), self.case_to_execute,
channel_action_fn, channel, value, is_copy)
self.cases.append(select_case)
return select_case
def default(self):
"""Create a default case block for this condition.
"""
default_case = SelectCase(self, len(self.cases), self.case_to_execute)
self.cases.append(default_case)
return default_case
def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is not None:
return False
# Create a select op and another block to wrap its
# case blocks.
select_block = self.helper.main_program.current_block()
parent_block = self.helper.main_program.block(select_block.parent_idx)
# Construct each case op, inside the newly created select block.
serialized_cases = []
for case in self.cases:
serialized_cases.append(case.construct_op())
intermediate = set()
params = set()
for case_block in select_block.ops:
if case_block.attrs and 'sub_block' in case_block.attrs:
for each_op in case_block.attrs['sub_block'].ops:
assert isinstance(each_op, Operator)
for iname in each_op.input_names:
for in_var_name in each_op.input(iname):
if in_var_name not in intermediate:
params.add(in_var_name)
for oname in each_op.output_names:
for out_var_name in each_op.output(oname):
intermediate.add(out_var_name)
out_list = [
parent_block.var(var_name) for var_name in parent_block.vars
if var_name in intermediate
]
X = [select_block.var_recursive(x_name) for x_name in params]
# Needs to be used by `equal` inside the cases block.
X.append(self.case_to_execute)
# Construct the select op.
parent_block.append_op(
type='select',
inputs={'X': X,
'case_to_execute': self.case_to_execute},
attrs={'sub_block': select_block,
'cases': serialized_cases},
outputs={'Out': out_list})
return super(Select, self).__exit__(exc_type, exc_val, exc_tb)
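A minimal sketch of driving the relocated Select from user code, assuming these names remain importable from paddle.fluid.concurrency as this file defines them; the dtype and the empty case bodies are illustrative only:

from paddle.fluid import core
from paddle.fluid.layers import fill_constant
from paddle.fluid.concurrency import Select, make_channel, channel_send

ch = make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
x = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=10)

with Select() as select:
    with select.case(channel_send, ch, x, is_copy=True):
        pass  # ops built here run when the send case is chosen
    with select.default():
        pass  # ops built here run when no channel is ready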
def make_channel(dtype, capacity=0): def make_channel(dtype, capacity=0):
""" """
Helps implementation of a concurrent program by creating a "channel" of Helps implementation of a concurrent program by creating a "channel" of
...@@ -159,35 +360,26 @@ def channel_send(channel, value, is_copy=False): ...@@ -159,35 +360,26 @@ def channel_send(channel, value, is_copy=False):
main_program = helper.main_program main_program = helper.main_program
channel_send_block = main_program.current_block() channel_send_block = main_program.current_block()
status = helper.create_variable(
name=unique_name.generate('status'),
type=core.VarDesc.VarType.LOD_TENSOR,
dtype=core.VarDesc.VarType.BOOL)
X = value X = value
if is_copy is True: if is_copy:
copied_X = helper.create_variable( copied_X = helper.create_variable(
name=unique_name.generate(value.name + '_copy'), name=unique_name.generate(value.name + '_copy'),
type=value.type, type=value.type,
dtype=value.dtype, dtype=value.dtype,
shape=value.shape, shape=value.shape,
lod_level=value.lod_level, lod_level=value.lod_level,
capacity=value.capacity) capacity=value.capacity if hasattr(value, 'capacity') else None)
assign_op = channel_send_block.append_op( assign_op = channel_send_block.append_op(
type="assign_op", inputs={"X": value}, outputs={"Out": copied_X}) type="assign", inputs={"X": value}, outputs={"Out": copied_X})
X = copied_X X = copied_X
channel_send_op = channel_send_block.append_op( channel_send_block.append_op(
type="channel_send", type="channel_send", inputs={
inputs={
"Channel": channel, "Channel": channel,
"X": X, "X": X,
}, })
outputs={"Status": status})
return status
def channel_recv(channel, return_value): def channel_recv(channel, return_value):
......
...@@ -565,6 +565,8 @@ class DistributeTranspiler: ...@@ -565,6 +565,8 @@ class DistributeTranspiler:
orig_var_name = "" orig_var_name = ""
if suff_idx >= 0: if suff_idx >= 0:
orig_var_name = varname[:suff_idx] orig_var_name = varname[:suff_idx]
else:
orig_var_name = varname
return orig_var_name return orig_var_name
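The added else branch makes the helper total: a name without the trainer suffix now passes through unchanged instead of collapsing to an empty string. A sketch of the behavior (the actual suffix marker is computed in the elided lines above; ".trainer_" is an assumption here):

def orig_varname(varname, marker=".trainer_"):
    suff_idx = varname.find(marker)
    if suff_idx >= 0:
        return varname[:suff_idx]
    return varname                       # the newly added branch

assert orig_varname("fc_0.w_0.trainer_0") == "fc_0.w_0"
assert orig_varname("fc_0.w_0") == "fc_0.w_0"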
def _append_pserver_ops(self, optimize_block, opt_op, endpoint, def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
...@@ -579,7 +581,8 @@ class DistributeTranspiler: ...@@ -579,7 +581,8 @@ class DistributeTranspiler:
grad_block = None grad_block = None
for g in self.param_grad_ep_mapping[endpoint]["grads"]: for g in self.param_grad_ep_mapping[endpoint]["grads"]:
if same_or_split_var( if same_or_split_var(
self._orig_varname(g.name), opt_op.input(key)[0]): self._orig_varname(g.name),
self._orig_varname(opt_op.input(key)[0])):
grad_block = g grad_block = g
break break
if not grad_block: if not grad_block:
...@@ -750,7 +753,7 @@ class DistributeTranspiler: ...@@ -750,7 +753,7 @@ class DistributeTranspiler:
param_names = [ param_names = [
p.name for p in self.param_grad_ep_mapping[endpoint]["params"] p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
] ]
if op.input("Param") in param_names: if op.input("Param")[0] in param_names:
return True return True
else: else:
for n in param_names: for n in param_names:
......
...@@ -403,6 +403,8 @@ class LayerHelper(object): ...@@ -403,6 +403,8 @@ class LayerHelper(object):
if 'use_mkldnn' in self.kwargs: if 'use_mkldnn' in self.kwargs:
act['use_mkldnn'] = self.kwargs.get('use_mkldnn') act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
act_type = act.pop('type') act_type = act.pop('type')
if 'use_mkldnn' in self.kwargs:
act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
self.append_op( self.append_op(
type=act_type, type=act_type,
inputs={"X": [input_var]}, inputs={"X": [input_var]},
......
...@@ -16,7 +16,7 @@ import contextlib ...@@ -16,7 +16,7 @@ import contextlib
from layer_function_generator import autodoc from layer_function_generator import autodoc
from tensor import assign, fill_constant from tensor import assign, fill_constant
from .. import core from .. import core
from ..framework import Program, Variable, Operator, Block from ..framework import Program, Variable, Operator
from ..layer_helper import LayerHelper, unique_name from ..layer_helper import LayerHelper, unique_name
from ops import logical_and, logical_not, logical_or from ops import logical_and, logical_not, logical_or
...@@ -29,7 +29,6 @@ __all__ = [ ...@@ -29,7 +29,6 @@ __all__ = [
'WhileGuard', 'WhileGuard',
'While', 'While',
'Switch', 'Switch',
'Select',
'lod_rank_table', 'lod_rank_table',
'max_sequence_len', 'max_sequence_len',
'topk', 'topk',
...@@ -1212,186 +1211,6 @@ class Switch(object): ...@@ -1212,186 +1211,6 @@ class Switch(object):
return True return True
class SelectCase(object):
DEFAULT = 0
SEND = 1
RECEIVE = 2
def __init__(self,
case_idx,
case_to_execute,
channel_action_fn=None,
channel=None,
value=None):
self.helper = LayerHelper('conditional_block')
self.main_program = self.helper.main_program
self.is_scalar_condition = True
self.case_to_execute = case_to_execute
self.idx = case_idx
# Since we aren't going to use the `channel_send` or `channel_recv`
# functions directly, we just need to capture the name.
self.action = (self.SEND
if channel_action_fn.__name__ == ('channel_send') else
self.RECEIVE) if channel_action_fn else (self.DEFAULT)
self.value = value
self.channel = channel
def __enter__(self):
self.block = self.main_program.create_block()
def construct_op(self):
main_program = self.helper.main_program
cases_block = main_program.current_block()
inner_outputs = set()
input_set = set()
params = set()
for op in self.block.ops:
# Iterate over all operators, get all the inputs
# and add as input to the SelectCase operator.
for iname in op.input_names:
for in_var_name in op.input(iname):
if in_var_name not in inner_outputs:
input_set.add(in_var_name)
for oname in op.output_names:
for out_var_name in op.output(oname):
inner_outputs.add(out_var_name)
param_list = [
cases_block.var(each_name) for each_name in params
if each_name not in input_set
]
# Iterate over all operators, get all the outputs
# add to the output list of SelectCase operator only if
# they exist in the parent block.
out_vars = []
for inner_out_name in inner_outputs:
if inner_out_name in cases_block.vars:
out_vars.append(cases_block.var(inner_out_name))
# First, create an op that will determine whether or not this is the
# conditional variable to execute.
should_execute_block = equal(
fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx),
self.case_to_execute)
step_scope = cases_block.create_var(
type=core.VarDesc.VarType.STEP_SCOPES)
cases_block.append_op(
type='conditional_block',
inputs={'X': [should_execute_block],
'Params': param_list},
outputs={'Out': out_vars,
'Scope': [step_scope]},
attrs={
'sub_block': self.block,
'is_scalar_condition': self.is_scalar_condition
})
return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name
if self.channel else '', self.value.name
if self.value else '')
def __exit__(self, exc_type, exc_val, exc_tb):
self.main_program.rollback()
if exc_type is not None:
return False # re-raise exception
return True
class Select(BlockGuard):
def __init__(self, name=None):
self.helper = LayerHelper('select', name=name)
self.cases = []
super(Select, self).__init__(self.helper.main_program)
self.case_to_execute = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
def __enter__(self):
super(Select, self).__enter__()
return self
def case(self, channel_action_fn, channel, value):
"""Create a new block for this condition.
"""
select_case = SelectCase(
len(self.cases), self.case_to_execute, channel_action_fn, channel,
value)
self.cases.append(select_case)
return select_case
def default(self):
"""Create a default case block for this condition.
"""
default_case = SelectCase(len(self.cases), self.case_to_execute)
self.cases.append(default_case)
return default_case
def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is not None:
return False
# Create a select op and another block to wrap its
# case blocks.
select_block = self.helper.main_program.current_block()
parent_block = self.helper.main_program.block(select_block.parent_idx)
# Construct each case op, inside the newly created select block.
serialized_cases = []
for case in self.cases:
serialized_cases.append(case.construct_op())
intermediate = set()
params = set()
for case_block in select_block.ops:
if case_block.attrs and 'sub_block' in case_block.attrs:
for each_op in case_block.attrs['sub_block'].ops:
assert isinstance(each_op, Operator)
for iname in each_op.input_names:
for in_var_name in each_op.input(iname):
if in_var_name not in intermediate:
params.add(in_var_name)
for oname in each_op.output_names:
for out_var_name in each_op.output(oname):
intermediate.add(out_var_name)
# TODO(varunarora): Figure out if defining output is needed.
out_list = [
parent_block.var(var_name) for var_name in parent_block.vars
if var_name in intermediate
]
X = [select_block.var_recursive(x_name) for x_name in params]
# Needs to be used by `equal` inside the cases block.
X.append(self.case_to_execute)
# Construct the select op.
parent_block.append_op(
type='select',
inputs={'X': X,
'case_to_execute': self.case_to_execute},
attrs={'sub_block': select_block,
'cases': serialized_cases},
outputs={})
return super(Select, self).__exit__(exc_type, exc_val, exc_tb)
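For reference, the removed CSP-style Select API composed with channels roughly as in the following sketch (the channel setup is hypothetical; the pattern mirrors the test usage further down in this commit):

import paddle.fluid as fluid
from paddle.fluid import core

# Sketch only: how the Select/SelectCase classes deleted above were used.
ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
x = fluid.layers.fill_constant(shape=[1], dtype='int32', value=0)

with fluid.Select() as select:
    # Body runs if the send on `ch` is the case chosen to execute.
    with select.case(fluid.channel_send, ch, x):
        pass
    # Body runs if no channel action is ready.
    with select.default():
        pass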
class IfElseBlockGuard(object): class IfElseBlockGuard(object):
def __init__(self, is_true, ifelse): def __init__(self, is_true, ifelse):
if not isinstance(ifelse, IfElse): if not isinstance(ifelse, IfElse):
......
...@@ -133,6 +133,7 @@ def detection_output(loc, ...@@ -133,6 +133,7 @@ def detection_output(loc,
scores = nn.softmax(input=scores) scores = nn.softmax(input=scores)
scores = nn.reshape(x=scores, shape=old_shape) scores = nn.reshape(x=scores, shape=old_shape)
scores = nn.transpose(scores, perm=[0, 2, 1]) scores = nn.transpose(scores, perm=[0, 2, 1])
scores.stop_gradient = True
nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
helper.append_op( helper.append_op(
type="multiclass_nms", type="multiclass_nms",
...@@ -147,6 +148,7 @@ def detection_output(loc, ...@@ -147,6 +148,7 @@ def detection_output(loc,
'score_threshold': score_threshold, 'score_threshold': score_threshold,
'nms_eta': 1.0 'nms_eta': 1.0
}) })
nmsed_outs.stop_gradient = True
return nmsed_outs return nmsed_outs
...@@ -836,4 +838,6 @@ def multi_box_head(inputs, ...@@ -836,4 +838,6 @@ def multi_box_head(inputs,
mbox_locs_concat = tensor.concat(mbox_locs, axis=1) mbox_locs_concat = tensor.concat(mbox_locs, axis=1)
mbox_confs_concat = tensor.concat(mbox_confs, axis=1) mbox_confs_concat = tensor.concat(mbox_confs, axis=1)
box.stop_gradient = True
var.stop_gradient = True
return mbox_locs_concat, mbox_confs_concat, box, var return mbox_locs_concat, mbox_confs_concat, box, var
...@@ -113,9 +113,9 @@ class ListenAndServ(object): ...@@ -113,9 +113,9 @@ class ListenAndServ(object):
which can receive variables from clients and run a block. which can receive variables from clients and run a block.
""" """
def __init__(self, endpoint, fan_in=1, optimizer_mode=True): def __init__(self, endpoint, inputs, fan_in=1, optimizer_mode=True):
self.helper = LayerHelper("listen_and_serv") self.helper = LayerHelper("listen_and_serv")
self.inputs = [] self.inputs = inputs
self.outputs = [] self.outputs = []
self.endpoint = endpoint self.endpoint = endpoint
self.fan_in = fan_in self.fan_in = fan_in
...@@ -160,18 +160,13 @@ class ListenAndServ(object): ...@@ -160,18 +160,13 @@ class ListenAndServ(object):
current_block = main_program.current_block() current_block = main_program.current_block()
parent_block = self.parent_block() parent_block = self.parent_block()
params, grads = self.get_params_and_grads()
param_names = [p.name for p in params]
grad_names = [g.name for g in grads]
parent_block.append_op( parent_block.append_op(
type='listen_and_serv', type='listen_and_serv',
inputs={}, inputs={"X": self.inputs},
outputs={}, outputs={},
attrs={ attrs={
'endpoint': self.endpoint, 'endpoint': self.endpoint,
'Fanin': self.fan_in, 'Fanin': self.fan_in,
'ParamList': param_names,
'GradList': grad_names,
'OptimizeBlock': current_block 'OptimizeBlock': current_block
}) })
...@@ -196,10 +191,14 @@ def Send(endpoints, send_vars, get_vars): ...@@ -196,10 +191,14 @@ def Send(endpoints, send_vars, get_vars):
endpoints = list(set(epmap)) endpoints = list(set(epmap))
helper = LayerHelper("Send", **locals()) helper = LayerHelper("Send", **locals())
rpc_client_var = default_main_program().global_block().create_var(
name="RPC_CLIENT_VAR", persistable=True, type=core.VarDesc.VarType.RAW)
helper.append_op( helper.append_op(
type="send", type="send",
inputs={"X": send_vars}, inputs={"X": send_vars},
outputs={"Out": get_vars}, outputs={"Out": get_vars,
"RPCClient": rpc_client_var},
attrs={"endpoints": endpoints, attrs={"endpoints": endpoints,
"epmap": epmap}) "epmap": epmap})
......
...@@ -75,6 +75,7 @@ __all__ = [ ...@@ -75,6 +75,7 @@ __all__ = [
'autoincreased_step_counter', 'autoincreased_step_counter',
'reshape', 'reshape',
'lod_reset', 'lod_reset',
'lrn',
] ]
...@@ -3508,3 +3509,73 @@ def lod_reset(x, y=None, target_lod=None): ...@@ -3508,3 +3509,73 @@ def lod_reset(x, y=None, target_lod=None):
raise ValueError("y and target_lod should not be both None.") raise ValueError("y and target_lod should not be both None.")
return out return out
def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
"""
Local Response Normalization Layer. This layer performs a type of
"lateral inhibition" by normalizing over local input regions.
The formula is as follows:
.. math::
Output(i, x, y) = Input(i, x, y) / \left(
k + \alpha \sum\limits^{\min(C, i + n/2)}_{j = \max(0, i - n/2)}
(Input(j, x, y))^2 \right)^{\beta}
In the above equation:
* :math:`n`: The number of channels to sum over.
* :math:`k`: The offset (avoid being divided by 0).
* :math:`\alpha`: The scaling parameter.
* :math:`\beta`: The exponent parameter.
Refer to `ImageNet Classification with Deep Convolutional Neural Networks
<https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf>`_
Args:
input (Variable): The input tensor of this layer; the rank of the input tensor must be 4.
n (int, default 5): The number of channels to sum over.
k (float, default 1.0): An offset (usually positive to avoid dividing by 0).
alpha (float, default 1e-4): The scaling parameter.
beta (float, default 0.75): The exponent.
name (str, default None): A name for this operation.
Raises:
ValueError: If rank of the input tensor is not 4.
Returns:
A tensor variable storing the transformation result.
Examples:
.. code-block:: python
data = fluid.layers.data(name="data", shape=[3, 112, 112], dtype="float32")
lrn = fluid.layers.lrn(input=data)
"""
helper = LayerHelper('lrn', **locals())
dtype = helper.input_dtype()
input_shape = input.shape
dims = len(input_shape)
if dims != 4:
raise ValueError(
"dims of input must be 4(not %d), and it's order must be NCHW" %
(dims))
mid_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
lrn_out = helper.create_tmp_variable(dtype)
helper.append_op(
type="lrn",
inputs={"X": input},
outputs={
"Out": lrn_out,
"MidOut": mid_out,
},
attrs={"n": n,
"k": k,
"alpha": alpha,
"beta": beta})
return lrn_out
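As a sanity check on the docstring formula, here is a minimal NumPy sketch of the same normalization (the upper window bound is assumed inclusive, matching min(C, i + n/2)):

import numpy as np

def lrn_reference(x, n=5, k=1.0, alpha=1e-4, beta=0.75):
    # x is NCHW; each channel is normalized by the sum of squares over a
    # window of n neighboring channels, per the formula above.
    N, C, H, W = x.shape
    out = np.empty_like(x)
    squared = x ** 2
    for i in range(C):
        lo = max(0, i - n // 2)
        hi = min(C, i + n // 2 + 1)
        denom = (k + alpha * squared[:, lo:hi].sum(axis=1)) ** beta
        out[:, i] = x[:, i] / denom
    return out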
...@@ -173,16 +173,10 @@ class TestRoutineOp(unittest.TestCase): ...@@ -173,16 +173,10 @@ class TestRoutineOp(unittest.TestCase):
with while_op.block(): with while_op.block():
result2 = fill_constant( result2 = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
x_to_send_tmp = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
# TODO(abhinav): Need to perform copy when doing a channel send.
# Once this is complete, we can remove these lines
assign(input=x, output=x_to_send_tmp)
with fluid.Select() as select: with fluid.Select() as select:
with select.case(fluid.channel_send, channel, with select.case(
x_to_send_tmp): fluid.channel_send, channel, x, is_copy=True):
assign(input=x, output=x_tmp) assign(input=x, output=x_tmp)
assign(input=y, output=x) assign(input=y, output=x)
assign(elementwise_add(x=x_tmp, y=y), output=y) assign(elementwise_add(x=x_tmp, y=y), output=y)
...@@ -230,21 +224,12 @@ class TestRoutineOp(unittest.TestCase): ...@@ -230,21 +224,12 @@ class TestRoutineOp(unittest.TestCase):
core.VarDesc.VarType.LOD_TENSOR, core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.FP64) core.VarDesc.VarType.FP64)
pong_result = self._create_tensor('pong_return_value',
core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.FP64)
def ping(ch, message): def ping(ch, message):
message_to_send_tmp = fill_constant( fluid.channel_send(ch, message, is_copy=True)
shape=[1], dtype=core.VarDesc.VarType.FP64, value=0)
assign(input=message, output=message_to_send_tmp)
fluid.channel_send(ch, message_to_send_tmp)
def pong(ch1, ch2): def pong(ch1, ch2):
fluid.channel_recv(ch1, ping_result) fluid.channel_recv(ch1, ping_result)
assign(input=ping_result, output=pong_result) fluid.channel_send(ch2, ping_result, is_copy=True)
fluid.channel_send(ch2, pong_result)
pings = fluid.make_channel( pings = fluid.make_channel(
dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1) dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
......
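The new is_copy=True flag subsumes the temporary-variable dance deleted above; assuming channel_send clones the variable internally, the two patterns are equivalent:

# Before: copy into a temporary, then send the temporary.
x_to_send_tmp = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
assign(input=x, output=x_to_send_tmp)
fluid.channel_send(channel, x_to_send_tmp)

# After: let channel_send perform the copy itself.
fluid.channel_send(channel, x, is_copy=True)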
...@@ -506,5 +506,54 @@ class TestSwish(OpTest): ...@@ -506,5 +506,54 @@ class TestSwish(OpTest):
self.check_grad(['X'], 'Out', max_relative_error=0.008) self.check_grad(['X'], 'Out', max_relative_error=0.008)
#--------------------test MKLDNN--------------------
class TestMKLDNNRelu(TestRelu):
def setUp(self):
super(TestMKLDNNRelu, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason as in TestAbs: keep inputs away from zero, where the gradient is ill-defined.
x[np.abs(x) < 0.005] = 0.02
out = np.maximum(x, 0)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNTanh(TestTanh):
def setUp(self):
super(TestMKLDNNTanh, self).setUp()
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
}
self.outputs = {'Out': np.tanh(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNSqrt(TestSqrt):
def setUp(self):
super(TestMKLDNNSqrt, self).setUp()
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
}
self.outputs = {'Out': np.sqrt(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNAbs(TestAbs):
def setUp(self):
super(TestMKLDNNAbs, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason as in TestAbs: avoid inputs near zero, where the gradient of abs is undefined.
x[np.abs(x) < 0.005] = 0.02
self.inputs = {'X': x}
self.outputs = {'Out': np.abs(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
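Each MKLDNN variant above reuses the base case's output and gradient checks and only flips the use_mkldnn attribute; assuming a TestSigmoid base exists in this file like the others, one more variant would be just:

class TestMKLDNNSigmoid(TestSigmoid):
    # Hypothetical sketch: inherit inputs, outputs and gradient checks,
    # and only switch the kernel selection via the use_mkldnn attr.
    def setUp(self):
        super(TestMKLDNNSigmoid, self).setUp()
        self.attrs = {"use_mkldnn": True}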
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -20,19 +20,35 @@ from op_test import OpTest ...@@ -20,19 +20,35 @@ from op_test import OpTest
class TestConcatOp(OpTest): class TestConcatOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "concat" self.op_type = "concat"
x0 = np.random.random((2, 1, 4, 5)).astype('float32') self.init_test_data()
x1 = np.random.random((2, 2, 4, 5)).astype('float32') self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
x2 = np.random.random((2, 3, 4, 5)).astype('float32') self.attrs = {'axis': self.axis}
axis = 1 self.outputs = {
self.inputs = {'X': [('x0', x0), ('x1', x1), ('x2', x2)]} 'Out': np.concatenate(
self.attrs = {'axis': axis} (self.x0, self.x1, self.x2), axis=self.axis)
self.outputs = {'Out': np.concatenate((x0, x1, x2), axis=axis)} }
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['x0'], 'Out') self.check_grad(['x0'], 'Out')
self.check_grad(['x1'], 'Out')
self.check_grad(['x2'], 'Out')
def init_test_data(self):
self.x0 = np.random.random((2, 1, 4, 5)).astype('float32')
self.x1 = np.random.random((2, 2, 4, 5)).astype('float32')
self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
self.axis = 1
class TestConcatOp2(OpTest):
def init_test_data(self):
self.x0 = np.random.random((2, 3, 4, 5)).astype('float32')
self.x1 = np.random.random((2, 3, 4, 5)).astype('float32')
self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
self.axis = 1
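With the data factored into init_test_data, further variants only override that hook; for example, a hypothetical case concatenating along axis 0:

class TestConcatOpAxis0(TestConcatOp):
    # Hypothetical variant: shapes differ along axis 0 instead of axis 1.
    def init_test_data(self):
        self.x0 = np.random.random((1, 3, 4, 5)).astype('float32')
        self.x1 = np.random.random((2, 3, 4, 5)).astype('float32')
        self.x2 = np.random.random((3, 3, 4, 5)).astype('float32')
        self.axis = 0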
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -231,6 +231,13 @@ class TestBook(unittest.TestCase): ...@@ -231,6 +231,13 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(layers.softmax(hid)) self.assertIsNotNone(layers.softmax(hid))
print(str(program)) print(str(program))
def test_lrn(self):
program = Program()
with program_guard(program):
data = layers.data(name='data', shape=[6, 2, 2], dtype='float32')
self.assertIsNotNone(layers.lrn(data))
print(str(program))
def test_get_places(self): def test_get_places(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
......
...@@ -87,5 +87,34 @@ class TestLRNOp(OpTest): ...@@ -87,5 +87,34 @@ class TestLRNOp(OpTest):
self.check_grad(['X'], 'Out', max_relative_error=0.01) self.check_grad(['X'], 'Out', max_relative_error=0.01)
class TestLRNMKLDNNOp(TestLRNOp):
def get_attrs(self):
attrs = TestLRNOp.get_attrs(self)
attrs['use_mkldnn'] = True
return attrs
def test_check_output(self):
self.check_output(atol=0.002)
class TestLRNMKLDNNOpWithIsTest(TestLRNMKLDNNOp):
def get_attrs(self):
attrs = TestLRNMKLDNNOp.get_attrs(self)
attrs['is_test'] = True
return attrs
def test_check_grad_normal(self):
def check_raise_is_test():
try:
self.check_grad(['X'], 'Out', max_relative_error=0.01)
except Exception as e:
t = \
"is_test attribute should be set to False in training phase."
if t in str(e):
raise AttributeError
self.assertRaises(AttributeError, check_raise_is_test)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -38,14 +38,15 @@ class TestRecvOp(unittest.TestCase): ...@@ -38,14 +38,15 @@ class TestRecvOp(unittest.TestCase):
def init_serv(self, place): def init_serv(self, place):
main = fluid.Program() main = fluid.Program()
with fluid.program_guard(main): with fluid.program_guard(main):
x = layers.data( serv = layers.ListenAndServ(
shape=[32, 32], "127.0.0.1:6174", ["X"], optimizer_mode=False)
dtype='float32',
name="X",
append_batch_size=False)
fluid.initializer.Constant(value=1.0)(x, main.global_block())
serv = layers.ListenAndServ("127.0.0.1:6174", optimizer_mode=False)
with serv.do(): with serv.do():
x = layers.data(
shape=[32, 32],
dtype='float32',
name="X",
append_batch_size=False)
fluid.initializer.Constant(value=1.0)(x, main.global_block())
o = layers.scale(x=x, scale=10.0) o = layers.scale(x=x, scale=10.0)
main.global_block().create_var( main.global_block().create_var(
name=o.name, persistable=False, dtype=o.dtype, shape=o.shape) name=o.name, persistable=False, dtype=o.dtype, shape=o.shape)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestSplitIdsOp(OpTest):
def setUp(self):
self.op_type = "split_ids"
ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
out0 = np.array([[0], [3], [6]]).astype('int64')
out1 = np.array([[]]).astype('int64')
out2 = np.array([[2], [2], [5], [5]]).astype('int64')
self.inputs = {'Ids': ids}
self.outputs = {'Out': [('out0', out0), ('out1', out1), ('out2', out2)]}
def test_check_output(self):
self.check_output()
if __name__ == '__main__':
unittest.main()
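The expected outputs are consistent with sharding each id by modulo over the number of outputs; a reference sketch (the modulo rule is an assumption inferred from the fixture, not taken from the operator source):

import numpy as np

def split_ids_reference(ids, num_shards=3):
    # Assumed rule: id goes to shard (id % num_shards), input order kept.
    shards = [[] for _ in range(num_shards)]
    for v in ids.flatten():
        shards[int(v) % num_shards].append([int(v)])
    return [np.array(s if s else [[]], dtype='int64') for s in shards]

# Reproduces out0=[[0],[3],[6]], out1=[[]], out2=[[2],[2],[5],[5]] above.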
...@@ -126,7 +126,6 @@ class TestTensor(unittest.TestCase): ...@@ -126,7 +126,6 @@ class TestTensor(unittest.TestCase):
def test_lod_tensor_gpu_init(self): def test_lod_tensor_gpu_init(self):
if not core.is_compiled_with_cuda(): if not core.is_compiled_with_cuda():
return return
scope = core.Scope()
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
lod_py = [[0, 2, 5], [0, 2, 4, 5]] lod_py = [[0, 2, 5], [0, 2, 4, 5]]
lod_tensor = core.LoDTensor() lod_tensor = core.LoDTensor()
...@@ -144,6 +143,25 @@ class TestTensor(unittest.TestCase): ...@@ -144,6 +143,25 @@ class TestTensor(unittest.TestCase):
self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1]) self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
self.assertListEqual(lod_py, lod_tensor.lod()) self.assertListEqual(lod_py, lod_tensor.lod())
def test_empty_tensor(self):
place = core.CPUPlace()
scope = core.Scope()
var = scope.var("test_tensor")
tensor = var.get_tensor()
tensor.set_dims([0, 1])
tensor.alloc_float(place)
tensor_array = numpy.array(tensor)
self.assertEqual((0, 1), tensor_array.shape)
if core.is_compiled_with_cuda():
gpu_place = core.CUDAPlace(0)
tensor.alloc_float(gpu_place)
tensor_array = numpy.array(tensor)
self.assertEqual((0, 1), tensor_array.shape)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -77,7 +77,7 @@ class SoftmaxActivation(BaseActivation): ...@@ -77,7 +77,7 @@ class SoftmaxActivation(BaseActivation):
.. math:: .. math::
P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_j} } P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_k} }
""" """
def __init__(self): def __init__(self):
......
...@@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could ...@@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could
be used in user program. be used in user program.
""" """
__all__ = ['np_array', 'text_file', "cloud_reader"] __all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader']
def np_array(x): def np_array(x):
......