Unverified commit 4e363a41, authored by W weixing, committed by GitHub

Merge branch 'develop' into build_manually

......@@ -36,6 +36,7 @@ include(simd)
################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF)
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
......@@ -180,6 +181,11 @@ if(WITH_GPU)
include(cuda)
endif(WITH_GPU)
if(WITH_AMD_GPU)
find_package(HIP)
include(hip)
endif(WITH_AMD_GPU)
if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
......
......@@ -57,11 +57,7 @@ if(NOT WITH_GOLANG)
add_definitions(-DPADDLE_WITHOUT_GOLANG)
endif(NOT WITH_GOLANG)
if(NOT WITH_GPU)
add_definitions(-DHPPL_STUB_FUNC)
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
else()
if(WITH_GPU)
add_definitions(-DPADDLE_WITH_CUDA)
FIND_PACKAGE(CUDA REQUIRED)
......@@ -84,7 +80,14 @@ else()
# Include cuda and cudnn
include_directories(${CUDNN_INCLUDE_DIR})
include_directories(${CUDA_TOOLKIT_INCLUDE})
endif(NOT WITH_GPU)
elseif(WITH_AMD_GPU)
add_definitions(-DPADDLE_WITH_HIP)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__")
else()
add_definitions(-DHPPL_STUB_FUNC)
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
endif()
if (WITH_MKLML AND MKLML_IOMP_LIB)
message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
......
......@@ -24,7 +24,7 @@ set(BOOST_PROJECT "extern_boost")
# So we use 1.41.0 here.
set(BOOST_VER "1.41.0")
set(BOOST_TAR "boost_1_41_0")
set(BOOST_URL "http://paddlepaddledeps.s3-website-us-west-1.amazonaws.com/${BOOST_TAR}.tar.gz")
set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz")
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
......
......@@ -4,18 +4,33 @@ SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
ExternalProject_Add(
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10
PREFIX ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
if(WITH_AMD_GPU)
ExternalProject_Add(
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git"
GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
PREFIX ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
else()
ExternalProject_Add(
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10
PREFIX ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
endif()
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)
......
......@@ -317,6 +317,82 @@ function(nv_test TARGET_NAME)
endif()
endfunction(nv_test)
function(hip_library TARGET_NAME)
if (WITH_AMD_GPU)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(_sources ${hip_library_SRCS})
HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
if(_source_files)
list(REMOVE_ITEM _sources ${_source_files})
endif()
if(hip_library_SRCS)
if (hip_library_SHARED OR hip_library_shared) # build *.so
add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
else()
add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
find_fluid_modules(${TARGET_NAME})
endif()
if (hip_library_DEPS)
add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
endif()
# cpplint code style
foreach(source_file ${hip_library_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
add_style_check_target(${TARGET_NAME} ${hip_library_SRCS} ${hip_library_HEADERS})
else(hip_library_SRCS)
if (hip_library_DEPS)
merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
else()
message(FATAL_ERROR "Please specify source file or library in hip_library.")
endif()
endif(hip_library_SRCS)
endif()
endfunction(hip_library)
function(hip_binary TARGET_NAME)
if (WITH_AMD_GPU)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS})
if(hip_binary_DEPS)
target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
endif()
endif()
endfunction(hip_binary)
function(hip_test TARGET_NAME)
if (WITH_AMD_GPU AND WITH_TESTING)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(_sources ${hip_test_SRCS})
HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
if(_source_files)
list(REMOVE_ITEM _sources ${_source_files})
endif()
add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_test(${TARGET_NAME} ${TARGET_NAME})
endif()
endfunction(hip_test)
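To make the calling convention of the three HIP helpers above concrete, here is a minimal, hypothetical usage sketch; the operator name, source files and dependency names below do not exist in this change and are illustrative only:

# Hypothetical usage sketch of the HIP helpers defined above.
hip_library(my_hip_op SRCS my_hip_op.hip.cu DEPS tensor device_context)  # static library by default
hip_binary(my_hip_tool SRCS my_hip_tool.hip.cu DEPS my_hip_op)
hip_test(my_hip_op_test SRCS my_hip_op_test.hip.cu DEPS my_hip_op)       # only built when WITH_AMD_GPU and WITH_TESTING are ON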
function(go_library TARGET_NAME)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
......
if(NOT WITH_AMD_GPU)
return()
endif()
include_directories("/opt/rocm/include")
include_directories("/opt/rocm/hipblas/include")
include_directories("/opt/rocm/hiprand/include")
include_directories("/opt/rocm/rocrand/include")
include_directories("/opt/rocm/rccl/include")
include_directories("/opt/rocm/thrust")
list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" )
if(WITH_DSO)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
endif(WITH_DSO)
if(WITH_DOUBLE)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE")
endif(WITH_DOUBLE)
if(WITH_TESTING)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
endif(WITH_TESTING)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()
if("x${HCC_HOME}" STREQUAL "x")
set(HCC_HOME "/opt/rocm/hcc")
endif()
set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
......@@ -2,13 +2,14 @@
如何贡献文档
#############
PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成,也可以利用paddlepaddle.org工具来编译和预览文档
PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成的,PaddlePaddle.org工具可以帮助我们实现这一编译过程,并提供更好的预览效果
如何构建文档
============
PaddlePaddle的文档构建有两种方式,分别为使用paddlepaddle.org工具和不使用paddlepaddle.org工具,两种方式都有各自的优点,前者方便预览,后者方便开发者进行调试。这两种方式中又分别有使用docker和不使用docker的两种构建方法。
我们建议使用PaddlePaddle.org工具来构建文档。
使用PaddlePaddle.org工具
------------------------
......@@ -28,7 +29,7 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D
docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令
之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档
之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档
编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
如果不想使用Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
......@@ -50,7 +51,7 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D
python manage.py runserver
工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请将PaddlePaddle的工作目录指定给环境变量 CONTENT_DIR。
之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。
之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。
编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ 。
......@@ -109,9 +110,7 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D
python -m SimpleHTTPServer 8088
在浏览器中输入http://localhost:8088就可以看到编译生成的中/英文的文档页面和英文的API页面。
下图为生成的英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。
在浏览器中输入 http://localhost:8088 就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。
.. image:: src/doc_en.png
:align: center
......
......@@ -2,21 +2,20 @@
Contribute Documentation
########################
PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content
PaddlePaddle's documentation includes both Chinese and English versions. The documentation is built using the ``cmake`` command to drive the ``sphinx`` compiler. The PaddlePaddle.org tool helps us to implement this compilation process and provides better preview results.
How to Build Documentations
============
How to build Documentation
===========================
We recommend using PaddlePaddle.org tool to build documentation
PaddlePaddle's documentation can be built in two ways: with the PaddlePaddle.org tool or without it. Both methods have their own advantages: the former facilitates previewing, while the latter facilitates debugging by the developer. In either case, we can choose to build the documentation with or without Docker.
We recommend using the PaddlePaddle.org tool to build the documentation.
Use PaddlePaddle.org tool
--------------
This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser.
Using PaddlePaddle.org tool
-----------------------------
This is the recommended method to build documentation, because it can automatically compile the documentation and preview the documentation directly in a web page. Note that, although you can preview the documentation in other ways, its style may not be consistent with the official website. Compiling with the PaddlePaddle.org tool produces a preview that will be consistent with the official website documentation style.
The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool
The PaddlePaddle.org tool can be used with Docker and Docker needs to be installed first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After installing Docker, you may use the following commands to activate the tool
.. code-block:: bash
......@@ -32,8 +31,8 @@ The tool uses Docker, please install it on your system. Please check Docker offi
# Please specify the working directory through -v
docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command
Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run commands
Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation.
The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
......@@ -58,19 +57,62 @@ If you don't wish to use Docker, you can also activate the tool through Django.
pip install -r requirements.txt
python manage.py runserver
Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
Set the environment variable CONTENT_DIR to the PaddlePaddle working directory so that the tool can locate the content.
Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation
The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
If you want to learn more on the PaddlePaddle.org, please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ 。
Please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ for more information about the PaddlePaddle.org tool.
Manually Building the Documentation
-------------------------------------
To build PaddlePaddle's documentation with Docker, you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After Docker is installed, you could use the scripts in the source directory to build the documentation.
[TBD]
If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation.
.. code-block:: bash
mkdir paddle
cd paddle
git clone https://github.com/PaddlePaddle/Paddle.git
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
# If you only need to build documents, use the following commands
make -j $processors gen_proto_py
make -j $processors paddle_docs paddle_docs_cn
# If you only need to build APIs, use the following commands
make -j $processors gen_proto_py framework_py_proto
make -j $processors copy_paddle_pybind
make -j $processors paddle_api_docs
$processors indicates the number of processes started to compile in parallel; it should be set according to the number of CPU cores on your machine (for example, the value reported by ``nproc``).
After the compilation is complete, enter the ``doc/v2`` directory. If you chose to build documents, it will generate ``cn/html/`` and ``en/html/`` subdirectories under this directory. If you chose to build APIs, it will generate an ``api/en/html`` subdirectory. Please enter these directories respectively and execute the following commands:
.. code-block:: bash
python -m SimpleHTTPServer 8088
Use a web browser and navigate to http://localhost:8088, where you can see the compiled Chinese/English documentation pages and the English API pages. The following figure is an example of the generated English documentation home page. Note that because the example uses the original sphinx theme, the page style is not consistent with the official website, but this does not affect the developer's debugging.
How to write Documentations
============
.. image:: src/doc_en.png
:align: center
:scale: 60 %
PaddlePaddle uses `sphinx`_ to compile documentations,Please check sphinx official website for more detail.
How to write Documentation
===========================
PaddlePaddle uses `sphinx`_ to compile documentation. Please check the sphinx official website for more detail.
How to update www.paddlepaddle.org
============================
===================================
Please create PRs and submit them to GitHub; for details, please check `Contribute Code <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_.
PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and
......
C-API Prediction Library
C-API Inference Library
========================
After we train a neural network, we use it to do inference. Inference is the process of preparing input data and propagating it through the model to produce the result.
Compared with model training, prediction has the following features:
#. Inference does not require backpropagation and parameter updates, as required during training.
#. Labels are not needed in prediction.
#. Most of the time, predictions need to be integrated with the user system.
Therefore, the model prediction SDK needs to be designed separately and has the following features:
#. The predictive SDK does not include backpropagation and parameter updates to reduce the size of the SDK.
#. The predictive SDK needs a simple user interface for ease of use.
#. Since the input data may have a variety of structures, the format of the input data must be packaged clearly and compactly.
#. In order to be compatible with the user's system, the SDK's interface must conform to a standard C interface.
PaddlePaddle provides a C-API to address the above requirements. The following are the guidelines for using the C-API:
.. toctree::
:maxdepth: 1
......
HOW TO
=======
========
PaddlePaddle gives users the ability to flexibly set various command-line parameters to control model training and the inference process. Please refer to the following instructions on using PaddlePaddle:
.. toctree::
:maxdepth: 1
cmd_parameter/index_en.rst
PaddlePaddle supports distributed training tasks on fabric clusters, MPI clusters, and Kubernetes clusters. For detailed configuration and usage instructions, refer to:
.. toctree::
:maxdepth: 1
cluster/index_en.rst
PaddlePaddle provides a C-API for inference. We provide the following guidelines for using the C-API:
.. toctree::
:maxdepth: 1
capi/index_en.rst
PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For details, please refer to:
.. toctree::
:maxdepth: 1
rnn/index_en.rst
For how to use the built-in timing tool, nvprof, or nvvp to run performance analysis and tuning, please refer to:
.. toctree::
:maxdepth: 1
optimization/gpu_profiling_en.rst
......@@ -150,8 +150,9 @@ void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
// Select block
AddOp("select", {{"X", {dataChanName, quitChanName}},
{"case_to_execute", {"caseToExecute"}}},
{}, {{"sub_block", casesBlock},
{"cases", std::vector<std::string>{case0Config, case1Config}}},
{{"Out", {}}},
{{"sub_block", casesBlock},
{"cases", std::vector<std::string>{case0Config, case1Config}}},
whileBlock);
scope->Var("stepScopes");
......@@ -209,9 +210,8 @@ TEST(Concurrency, Go_Op) {
executor.Run(program, &scope, 0, true, true);
// After we call executor.run, the Go operator should do a channel_send to set
// the
// "result" variable to 99
// After we call executor.run, the Go operator should do a channel_send to
// set the "result" variable to 99.
auto *finalData = tensor.data<int>();
EXPECT_EQ(finalData[0], 99);
}
......
......@@ -176,7 +176,7 @@ class Vector {
// resize the vector
void resize(size_t size) {
if (size + 1 < capacity()) {
if (size + 1 <= capacity()) {
size_ = size;
} else {
MutableCPU();
......
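The one-character change above matters exactly at the boundary. Below is a standalone C++ sketch of the two predicates (independent of the Vector internals), for capacity() == 2 and a resize(1) request:

#include <iostream>

int main() {
  size_t capacity = 2;  // value returned by capacity()
  size_t size = 1;      // requested logical size
  // Old predicate: 2 < 2 is false, so resize() would fall into the slower
  // MutableCPU()/reallocation branch even though the request fits.
  std::cout << "old fast path taken: " << (size + 1 < capacity) << "\n";   // prints 0
  // New predicate: 2 <= 2 is true, so resize() only updates size_.
  std::cout << "new fast path taken: " << (size + 1 <= capacity) << "\n";  // prints 1
  return 0;
}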
......@@ -104,3 +104,11 @@ TEST(mixed_vector, ForEach) {
for (auto& v : tmp) {
}
}
TEST(mixed_vector, Reserve) {
paddle::framework::Vector<int> vec;
vec.reserve(1);
vec.push_back(0);
vec.push_back(0);
vec.push_back(0);
}
......@@ -117,10 +117,10 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
PADDLE_ENFORCE_GT(
numel(), 0,
"When calling this method, the Tensor's numel must be larger than zero. "
"Please check Tensor::Resize has been called first.");
PADDLE_ENFORCE_GE(numel(), 0,
"When calling this method, the Tensor's numel must be "
"equal or larger than zero. "
"Please check Tensor::Resize has been called first.");
int64_t size = numel() * SizeOfType(type);
/* some versions of boost::variant don't have operator!= */
if (holder_ == nullptr || !(holder_->place() == place) ||
......
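Relaxing the check from PADDLE_ENFORCE_GT to PADDLE_ENFORCE_GE means a tensor whose shape contains a zero dimension can now pass through mutable_data (the 0-byte cases added to the buddy-allocator tests below exercise the corresponding allocation path). A minimal illustrative sketch, assuming a CPU build and the usual fluid headers; it is not part of this change:

#include <cstdint>
#include <vector>

#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"

void ZeroSizedTensorSketch() {
  paddle::framework::Tensor t;
  t.Resize(paddle::framework::make_ddim(std::vector<int64_t>{0, 4}));  // numel() == 0
  // Before this change, PADDLE_ENFORCE_GT(numel(), 0) would have fired here;
  // with PADDLE_ENFORCE_GE a zero-element allocation request is accepted.
  t.mutable_data<float>(paddle::platform::CPUPlace());
}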
......@@ -59,7 +59,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
EXPECT_EQ(total_size, 0UL);
for (auto size :
{128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
{0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
ps[paddle::memory::Alloc(cpu, size)] = size;
// Buddy Allocator doesn't manage too large memory chunk
......@@ -117,7 +117,7 @@ TEST(BuddyAllocator, GPUMultAlloc) {
EXPECT_EQ(total_size, 0UL);
for (auto size :
{128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
{0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
ps[paddle::memory::Alloc(gpu, size)] = size;
// Buddy Allocator doesn't manage too large memory chunk
......
......@@ -12,6 +12,8 @@ function(op_library TARGET)
set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
set(cc_srcs)
set(cu_srcs)
set(hip_cu_srcs)
set(miopen_hip_cc_srcs)
set(cu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(CUDNN_FILE)
......@@ -36,10 +38,19 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
endif()
string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif()
if(WITH_AMD_GPU)
string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
endif()
endif()
if(WITH_MKLDNN)
string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
......@@ -48,10 +59,14 @@ function(op_library TARGET)
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.cu$")
if (${src} MATCHES ".*\\.hip.cu$")
list(APPEND hip_cu_srcs ${src})
elseif (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
list(APPEND miopen_hip_cc_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
......@@ -76,6 +91,9 @@ function(op_library TARGET)
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
elseif (WITH_AMD_GPU)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
......@@ -88,7 +106,7 @@ function(op_library TARGET)
endif()
endforeach()
# The registration of USE_OP, please refer to paddle/framework/op_registry.h.
# The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to add just one operator to pybind in a *_op.cc file.
# And for detailed pybind information, please see the generated paddle/pybind/pybind.h.
file(READ ${TARGET}.cc TARGET_CONTENT)
......@@ -114,7 +132,10 @@ function(op_library TARGET)
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
......@@ -125,9 +146,19 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
endif()
# pybind USE_OP
......
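A hedged usage sketch of op_library with the new HIP/MIOPEN handling; the operator name is hypothetical, and the sibling source files are only picked up if they exist next to the CMakeLists.txt:

# Hypothetical: with no SRCS given, op_library() collects my_op.cc, my_op.cu,
# my_op.hip.cu, my_cudnn_op.cu.cc, my_miopen_op.hip.cc and my_mkldnn_op.cc
# automatically when present, then dispatches to nv_library, hip_library or
# cc_library depending on WITH_GPU / WITH_AMD_GPU.
op_library(my_op DEPS math_function)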
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "mkldnn_activation_op.h"
#include "paddle/fluid/operators/activation_op.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
namespace {
template <typename T, typename ExecContext>
void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
const T alpha = 0, const T beta = 0) {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine();
// get buffers
const auto *src = ctx.template Input<Tensor>("X");
const auto *src_data = src->template data<T>();
auto *dst = ctx.template Output<Tensor>("Out");
const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
// get memory dim
PADDLE_ENFORCE(src->dims().size() == 4,
"Input dim must be with 4, i.e. NCHW");
std::vector<int> src_tz = framework::vectorize2int(src->dims());
// create memory description
// TODO(kbinias-intel): support more formats
auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
// create memory primitives
auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src_data);
auto dst_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)dst_data);
auto forward_desc = mkldnn::eltwise_forward::desc(
mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
// save prim desc into global device context to be referred in backward path
const std::string key = ctx.op().Output("Out");
const std::string key_eltwise_pd = key + "@eltwise_pd";
auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
forward_desc, mkldnn_engine);
dev_ctx.SetBlob(key_eltwise_pd, forward_pd);
auto eltwise = mkldnn::eltwise_forward(*forward_pd, src_memory, dst_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline = {eltwise};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
template <typename T, typename ExecContext>
void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
const T alpha = 0, const T beta = 0) {
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine();
// get buffers
const auto *x = ctx.template Input<Tensor>("X");
const auto *src = x->template data<T>();
auto *dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
const auto *diff_dst = dout->template data<T>();
auto *dx =
ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
const T *diff_src = dx->template mutable_data<T>(ctx.GetPlace());
// get memory dim
std::vector<int> src_tz = framework::vectorize2int(x->dims());
// create memory description
auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
// create memory primitives
auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src);
auto diff_src_memory =
mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_src);
auto diff_dst_memory =
mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_dst);
auto backward_desc =
mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta);
// retrieve eltwise primitive desc from device context
const std::string key = ctx.op().Input("Out");
const std::string key_eltwise_pd = key + "@eltwise_pd";
const std::shared_ptr<void> forward_pd = dev_ctx.GetBlob(key_eltwise_pd);
PADDLE_ENFORCE(forward_pd != nullptr,
"Fail to find eltwise_pd in device context");
auto *p_forward_pd =
static_cast<mkldnn::eltwise_forward::primitive_desc *>(forward_pd.get());
auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
backward_desc, mkldnn_engine, *p_forward_pd);
auto eltwise_bwd = mkldnn::eltwise_backward(eltwise_bwd_prim_desc, src_memory,
diff_dst_memory, diff_src_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline = {eltwise_bwd};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
} // anonymous namespace
template <typename T, mkldnn::algorithm algorithm>
struct MKLDNNActivationFunc : public BaseActivationFunctor<T> {
template <typename ExecContext>
void operator()(const ExecContext &ctx) const {
eltwise_forward<T>(ctx, algorithm);
}
};
template <typename T, mkldnn::algorithm algorithm>
struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
template <typename ExecContext>
void operator()(const ExecContext &ctx) const {
eltwise_grad<T>(ctx, algorithm);
}
};
template <typename T>
using ReluMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_relu>;
template <typename T>
using TanhMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_tanh>;
template <typename T>
using SqrtMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_sqrt>;
template <typename T>
using AbsMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_abs>;
template <typename T>
using ReluMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_relu>;
template <typename T>
using TanhMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_tanh>;
template <typename T>
using SqrtMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_sqrt>;
template <typename T>
using AbsMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_abs>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
#define REGISTER_ACTIVATION_MKLDNN_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace, \
ops::MKLDNNActivationKernel<ops::functor<float>>); \
REGISTER_OP_KERNEL( \
act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace, \
ops::MKLDNNActivationGradKernel<ops::grad_functor<float>>);
#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \
__macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor); \
__macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor); \
__macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor); \
__macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor);
FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL);
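For reference, the relu entry of the list above expands, via REGISTER_ACTIVATION_MKLDNN_KERNEL, to roughly the following two registrations (this is just the macro spelled out, not additional code in the file):

REGISTER_OP_KERNEL(relu, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::MKLDNNActivationKernel<ops::ReluMkldnnFunctor<float>>);
REGISTER_OP_KERNEL(relu_grad, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::MKLDNNActivationGradKernel<ops::ReluMkldnnGradFunctor<float>>);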
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/mkldnn_activation_op.h"
namespace paddle {
namespace operators {
......@@ -87,6 +88,9 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Relu operator");
AddOutput("Out", "Output of Relu operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC(
Relu Activation Operator.
......@@ -140,6 +144,9 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Tanh operator");
AddOutput("Out", "Output of Tanh operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC(
Tanh Activation Operator.
......@@ -193,6 +200,9 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Sqrt operator");
AddOutput("Out", "Output of Sqrt operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC(
Sqrt Activation Operator.
......@@ -208,6 +218,9 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Abs operator");
AddOutput("Out", "Output of Abs operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC(
Abs Activation Operator.
......@@ -524,11 +537,11 @@ REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker,
REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
ops::ActivationOpGrad);
REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad,
ops::ActivationOpGrad);
REGISTER_OP(relu, ops::ActivationWithMKLDNNOp, ops::ReluOpMaker, relu_grad,
ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad,
ops::ActivationOpGrad);
REGISTER_OP(tanh, ops::ActivationWithMKLDNNOp, ops::TanhOpMaker, tanh_grad,
ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
tanh_shrink_grad, ops::ActivationOpGrad);
......@@ -536,11 +549,11 @@ REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker,
softshrink_grad, ops::ActivationOpGrad);
REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
ops::ActivationOpGrad);
REGISTER_OP(sqrt, ops::ActivationWithMKLDNNOp, ops::SqrtOpMaker, sqrt_grad,
ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad,
ops::ActivationOpGrad);
REGISTER_OP(abs, ops::ActivationWithMKLDNNOp, ops::AbsOpMaker, abs_grad,
ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad,
ops::ActivationOpGrad);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -17,6 +17,10 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
......
......@@ -457,12 +457,39 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
}
};
class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
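// Sketch of what this maker produces: a batch_norm_grad op whose inputs are
// the forward X and Scale, the gradient of Y, and the saved mean/variance from
// the forward pass, and whose outputs are the gradients of X, Scale and Bias;
// the forward attributes are copied over unchanged.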
auto *op = new framework::OpDesc();
op->SetType("batch_norm_grad");
op->SetInput("X", Input("X"));
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
op->SetInput("Scale", Input("Scale"));
op->SetInput("SavedMean", Output("SavedMean"));
op->SetInput("SavedVariance", Output("SavedVariance"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
batch_norm_grad, ops::BatchNormGradOp);
REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
ops::BatchNormGradMaker);
REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
REGISTER_OP_CPU_KERNEL(
batch_norm,
ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -126,6 +126,7 @@ width and height.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
REGISTER_OPERATOR(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
ops::BoxCoderKernel<double>);
......@@ -146,8 +146,9 @@ class GrpcByteBufferSource
class GrpcByteBufferSourceWrapper : public Source {
public:
GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) : source_(source) {}
virtual ::google::protobuf::io::ZeroCopyInputStream* contents() override {
explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source)
: source_(source) {}
::google::protobuf::io::ZeroCopyInputStream* contents() override {
return source_;
}
......
......@@ -21,15 +21,11 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/detail/simple_block_queue.h"
#include "paddle/fluid/operators/detail/grpc_service.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h"
#include "paddle/fluid/operators/detail/grpc_service.h"
//#include <grpc/support/log.h>
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/detail/simple_block_queue.h"
namespace paddle {
namespace operators {
......
......@@ -81,7 +81,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
// operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
framework::Scope scope;
scope.Var("myvar");
operators::detail::TensorResponse resp(&scope, &ctx);
operators::detail::VariableResponse resp(&scope, &ctx);
EXPECT_EQ(resp.Parse(msg), 0);
framework::Variable* var2 = resp.GetVar();
......@@ -166,7 +166,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
// deserialize zero-copy
framework::Scope scope;
scope.Var("myvar");
operators::detail::TensorResponse resp(&scope, &ctx);
operators::detail::VariableResponse resp(&scope, &ctx);
if (from_type == 0) {
EXPECT_EQ(resp.Parse(msg), 0);
} else {
......@@ -194,24 +194,23 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
}
TEST(LodTensor, GPU) {
platform::CUDAPlace place;
RunTestLodTensor(place);
RunTestLodTensor(place, 1);
}
TEST(LodTensor, CPU) {
TEST(LodTensor, Run) {
platform::CPUPlace place;
RunTestLodTensor(place);
RunTestLodTensor(place, 1);
#ifdef PADDLE_WITH_CUDA
platform::CUDAPlace gpu(0);
RunTestLodTensor(gpu);
RunTestLodTensor(gpu, 1);
#endif
}
TEST(SelectedRows, CPU) {
TEST(SelectedRows, Run) {
platform::CPUPlace place;
RunSerdeTestSelectedRows(place);
}
TEST(SelectedRows, GPU) {
platform::CUDAPlace place;
RunSerdeTestSelectedRows(place);
#ifdef PADDLE_WITH_CUDA
platform::CUDAPlace gpu;
RunSerdeTestSelectedRows(gpu);
#endif
}
......@@ -36,9 +36,9 @@ class VariableResponse {
public:
VariableResponse(const framework::Scope* scope,
const platform::DeviceContext* dev_ctx)
: scope_(scope), dev_ctx_(dev_ctx){};
: scope_(scope), dev_ctx_(dev_ctx) {}
virtual ~VariableResponse(){};
virtual ~VariableResponse() {}
// return:
// 0:ok.
......
......@@ -188,8 +188,8 @@ The general steps are as follows. First, calculate the true positive and
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(detection_map, ops::DetectionMAPOp,
ops::DetectionMAPOpMaker);
REGISTER_OPERATOR(detection_map, ops::DetectionMAPOp, ops::DetectionMAPOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
detection_map, ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, float>,
ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, double>);
......@@ -33,6 +33,7 @@ __global__ void RandomGenerator(const size_t n, const int seed,
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < n; idx += blockDim.x * gridDim.x) {
rng.discard(idx);
if (dist(rng) < dropout_prob) {
mask_data[idx] = static_cast<T>(0);
} else {
......
......@@ -87,8 +87,9 @@ $$
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(iou_similarity, ops::IOUSimilarityOp,
ops::IOUSimilarityOpMaker);
REGISTER_OPERATOR(iou_similarity, ops::IOUSimilarityOp,
ops::IOUSimilarityOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
iou_similarity,
......
......@@ -139,26 +139,25 @@ class ListenAndServOp : public framework::OperatorBase {
// should be global ops.
// NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads
// and this will still work.
std::vector<std::future<void>> fs;
// block0 contains only listen_and_serv op, start run from block1.
for (int blkid = 1; blkid < num_blocks - 1; ++blkid) {
fs.push_back(framework::Async([&executor, &program, &recv_scope,
blkid]() {
int run_block = blkid; // thread local
try {
executor.Run(*program, &recv_scope, run_block,
false /*create_local_scope*/, false /*create_vars*/);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
}));
fs.push_back(
framework::Async([&executor, &program, &recv_scope, blkid]() {
int run_block = blkid; // thread local
try {
executor.Run(*program, &recv_scope, run_block, false, false);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
}));
}
for (int i = 0; i < num_blocks - 2; ++i) fs[i].wait();
// Run global block at final step, or block1 if there are only 2 blocks
if (num_blocks >= 2) {
try {
executor.Run(*program, &recv_scope, num_blocks - 1,
false /*create_local_scope*/, false /*create_vars*/);
executor.Run(*program, &recv_scope, num_blocks - 1, false, false);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
......@@ -177,6 +176,10 @@ class ListenAndServOp : public framework::OperatorBase {
rpc_service_->WaitClientGet(fan_in);
sparse_vars.clear();
} // while(true)
// for (int i = 0; i < num_blocks; ++i) {
// delete blk_ctx_list[i];
// }
}
protected:
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/lrn_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
namespace {
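// Get-or-create helper: look up the blob stored under `key` in the MKLDNN
// device context; if it is not there yet, construct it from `args` and cache
// it for later reuse (e.g. by the backward kernel).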
template <typename T, typename... Args>
std::shared_ptr<T> insert_to_context(const std::string& key,
const MKLDNNDeviceContext& dev_ctx,
Args&&... args) {
auto p = std::static_pointer_cast<T, void>(dev_ctx.GetBlob(key));
if (!p) {
p = std::make_shared<T>(args...);
dev_ctx.SetBlob(key, std::static_pointer_cast<void, T>(p));
}
return p;
}
} // namespace
template <typename T>
class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(std::is_same<T, float>::value,
"MKLDNN LRN must use float data.");
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"MKLDNN LRN must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto x = ctx.Input<Tensor>("X");
auto out = ctx.Output<Tensor>("Out");
auto mid = ctx.Output<Tensor>("MidOut");
auto input_data = x->data<T>();
auto output_data = out->mutable_data<T>(ctx.GetPlace());
mid->mutable_data<T>(ctx.GetPlace());
const int n = ctx.Attr<int>("n");
const float alpha = ctx.Attr<float>("alpha");
const float beta = ctx.Attr<float>("beta");
const float k = ctx.Attr<float>("k");
const bool is_test = ctx.Attr<bool>("is_test");
auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
e_mid = e_mid.constant(k);
auto dims = paddle::framework::vectorize2int(x->dims());
auto src_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto dst_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward,
mkldnn::lrn_across_channels,
src_md,
n,
alpha,
beta,
k};
auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine},
static_cast<void*>(output_data)};
std::unique_ptr<mkldnn::lrn_forward> forward_op = nullptr;
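// In training mode the primitive descriptor, source memory and workspace are
// cached in the device context (keyed by the output name) so that the backward
// kernel can retrieve them; in inference mode they are created locally and
// discarded after the forward pass.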
if (!is_test) {
const std::string key = ctx.op().Output("Out");
const std::string key_src_memory = key + "@lrn_src_memory";
const std::string key_pd = key + "@lrn_pd";
const std::string key_workspace_memory = key + "@lrn_workspace_memory";
auto forward_pd = insert_to_context<mkldnn::lrn_forward::primitive_desc>(
key_pd, dev_ctx, forward_desc, mkldnn_engine);
auto src_memory = insert_to_context<mkldnn::memory>(
key_src_memory, dev_ctx, src_memory_pd);
src_memory->set_data_handle(
static_cast<void*>(const_cast<T*>(input_data)));
auto workspace_memory = insert_to_context<mkldnn::memory>(
key_workspace_memory, dev_ctx,
forward_pd->workspace_primitive_desc());
forward_op.reset(new mkldnn::lrn_forward{*forward_pd, *src_memory,
*workspace_memory, dst_memory});
} else {
auto forward_pd =
mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine};
auto src_memory = mkldnn::memory{
src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))};
auto workspace_memory =
mkldnn::memory{forward_pd.workspace_primitive_desc()};
forward_op.reset(new mkldnn::lrn_forward{forward_pd, src_memory,
workspace_memory, dst_memory});
}
std::vector<mkldnn::primitive> pipeline = {*forward_op};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
};
template <typename T>
class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(std::is_same<T, float>::value,
"MKLDNN LRN must use float data.");
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"MKLDNN LRN must use CPUPlace.");
auto x = ctx.Input<Tensor>("X");
auto out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
const std::string key = ctx.op().Input("Out");
const std::string key_src_memory = key + "@lrn_src_memory";
const std::string key_pd = key + "@lrn_pd";
const std::string key_workspace_memory = key + "@lrn_workspace_memory";
const int n = ctx.Attr<int>("n");
const float alpha = ctx.Attr<float>("alpha");
const float beta = ctx.Attr<float>("beta");
const float k = ctx.Attr<float>("k");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto x_grad_data = x_grad->mutable_data<T>(ctx.GetPlace());
auto out_grad_data = out_grad->data<T>();
auto dims = paddle::framework::vectorize2int(x->dims());
auto src_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto diff_src_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto diff_dst_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto diff_dst_memory =
mkldnn::memory{{diff_dst_md, mkldnn_engine},
static_cast<void*>(const_cast<float*>(out_grad_data))};
auto diff_src_memory = mkldnn::memory{{diff_src_md, mkldnn_engine},
static_cast<void*>(x_grad_data)};
auto backward_desc = mkldnn::lrn_backward::desc{
mkldnn::lrn_across_channels, src_md, diff_src_md, n, alpha, beta, k};
auto forward_pd = dev_ctx.GetBlob(key_pd);
auto backward_pd = mkldnn::lrn_backward::primitive_desc{
backward_desc, mkldnn_engine,
*static_cast<mkldnn::lrn_forward::primitive_desc*>(forward_pd.get())};
std::shared_ptr<void> workspace_memory =
dev_ctx.GetBlob(key_workspace_memory);
auto src_memory = dev_ctx.GetBlob(key_src_memory);
auto backward_op = mkldnn::lrn_backward{
backward_pd, *static_cast<mkldnn::memory*>(src_memory.get()),
diff_dst_memory, *static_cast<mkldnn::memory*>(workspace_memory.get()),
diff_src_memory};
std::vector<mkldnn::primitive> pipeline = {backward_op};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(lrn, MKLDNN, paddle::platform::CPUPlace,
ops::LRNMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(lrn_grad, MKLDNN, paddle::platform::CPUPlace,
ops::LRNMKLDNNGradOpKernel<float>);
......@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/lrn_op.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -116,6 +119,26 @@ struct LRNGradFunctor<platform::CPUDeviceContext, T> {
template struct LRNGradFunctor<platform::CPUDeviceContext, float>;
template struct LRNGradFunctor<platform::CPUDeviceContext, double>;
namespace {
framework::OpKernelType GetExpectedLRNKernel(
const framework::ExecutionContext& ctx) {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
layout_, library_);
}
} // namespace
class LRNOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......@@ -135,6 +158,11 @@ class LRNOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("MidOut", x_dim);
ctx->ShareLoD("X", /*->*/ "Out");
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetExpectedLRNKernel(ctx);
}
};
template <typename T>
......@@ -176,6 +204,17 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
"beta is the power number.")
.SetDefault(0.75)
.GreaterThan(0.0);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
"An optional string from: \"NHWC\", \"NCHW\". "
"Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ")
.SetDefault("AnyLayout");
AddAttr<bool>("is_test", "").SetDefault(false);
AddComment(R"DOC(
Local Response Normalization Operator.
......@@ -223,8 +262,12 @@ class LRNOpGrad : public framework::OperatorWithKernel {
auto x_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
}
};
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetExpectedLRNKernel(ctx);
}
};
} // namespace operators
} // namespace paddle
......
......@@ -6,6 +6,7 @@ function(math_library TARGET)
# But it handle split GPU/CPU code and link some common library.
set(cc_srcs)
set(cu_srcs)
set(hip_srcs)
set(math_common_deps device_context framework_proto)
set(multiValueArgs DEPS)
cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
......@@ -17,10 +18,15 @@ function(math_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_srcs ${TARGET}.hip.cu)
endif()
list(LENGTH cc_srcs cc_srcs_len)
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
elseif (WITH_AMD_GPU)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
elseif(${cc_srcs_len} GREATER 0)
cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
endif()
......
......@@ -20,7 +20,7 @@ namespace math {
/*
* All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension.
* each dimension must be the same, except the axis dimension.
*/
template <typename T>
class ConcatFunctor<platform::CPUDeviceContext, T> {
......@@ -63,7 +63,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
/*
* All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension.
* each dimension must be the same, except the axis dimension.
*/
template <typename T>
class ConcatGradFunctor<platform::CPUDeviceContext, T> {
......
......@@ -66,68 +66,66 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
}
template <typename T>
__global__ void KernelConcat(T** inputs, const int input_col,
const int output_rows, const int output_cols,
T* output) {
__global__ void KernelConcat(T** inputs_data, const int fixed_in_col,
const int out_rows, const int out_cols,
T* output_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
double inv_input_col = 1.0 / input_col;
for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
int split = tid_x * inv_input_col;
int in_offset = tid_x - split * input_col;
T* input_ptr = inputs[split];
for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) {
int split = tid_x * 1.0 / fixed_in_col;
int in_offset = tid_x - split * fixed_in_col;
T* input_ptr = inputs_data[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) {
output[tid_y * output_cols + tid_x] =
input_ptr[tid_y * input_col + in_offset];
for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) {
output_data[tid_y * out_cols + tid_x] =
input_ptr[tid_y * fixed_in_col + in_offset];
}
}
}
template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row,
const int input_col, const int* output_cols,
int col_size, T** outputs) {
__global__ void KernelConcatGrad(const T* input_data, const int in_row,
const int in_col, const int* out_cols,
int out_cols_size, T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(output_cols, col_size, tid_x) - 1;
int curr_offset = output_cols[segment];
int segment = upper_bound<int>(out_cols, out_cols_size, tid_x) - 1;
int curr_offset = out_cols[segment];
int curr_segment = segment;
for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset;
while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) {
while ((curr_col_offset = out_cols[curr_segment + 1]) <= tid_x) {
curr_offset = curr_col_offset;
++curr_segment;
}
int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset;
T* output_ptr = outputs[curr_segment];
T* output_ptr = outputs_data[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * segment_width + local_col] =
input[tid_y * input_col + tid_x];
input_data[tid_y * in_col + tid_x];
}
}
template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row,
const int input_col, const int output_cols,
T** outputs) {
__global__ void KernelConcatGrad(const T* input_data, const int in_row,
const int in_col, const int fixed_out_col,
T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
double inv_input_col = 1.0 / input_col;
for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
int split = tid_x * inv_input_col;
int in_offset = tid_x - split * input_col;
T* output_ptr = outputs[split];
for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
int split = tid_x / fixed_out_col;
int in_offset = tid_x - split * fixed_out_col;
T* output_ptr = outputs_data[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * output_cols + in_offset] =
input[tid_y * input_col + tid_x];
for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * fixed_out_col + in_offset] =
input_data[tid_y * in_col + tid_x];
}
}
/*
* All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension.
* each dimension must be the same, except the axis dimension.
*/
template <typename T>
class ConcatFunctor<platform::CUDADeviceContext, T> {
......@@ -136,41 +134,40 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
const std::vector<framework::Tensor>& input, const int axis,
framework::Tensor* output) {
// TODO(zcd): Add input data validity checking
int num = input.size();
int rows = 1;
int in_num = input.size();
int in_row = 1;
auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
in_row *= dim_0[i];
}
int cols = input[0].numel() / rows;
int out_rows = rows, out_cols = 0;
int in_col = input[0].numel() / in_row;
int out_row = in_row, out_col = 0;
framework::Vector<int16_t> inputs_data(num * sizeof(T*) / 2);
framework::Vector<int> inputs_cols(num + 1);
inputs_cols[0] = 0;
framework::Vector<int16_t> inputs_data(in_num * sizeof(T*) / 2);
framework::Vector<int> inputs_col(in_num + 1);
T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
inputs_col[0] = 0;
bool sameShape = true;
for (int i = 0; i < num; ++i) {
int t_cols = input[i].numel() / rows;
for (int i = 0; i < in_num; ++i) {
int t_cols = input[i].numel() / in_row;
if (sameShape) {
if (t_cols != cols) sameShape = false;
if (t_cols != in_col) sameShape = false;
}
out_cols += t_cols;
inputs_cols[i + 1] = out_cols;
out_col += t_cols;
inputs_col[i + 1] = out_col;
inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
}
T** ins_gpu =
T** dev_ins_data =
reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());
// computation
// set the thread block and grid according to CurrentDeviceId
const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock;
if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((out_cols + 31) >> 5) << 5;
if (out_col < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((out_col + 31) >> 5) << 5;
}
int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1);
......@@ -179,25 +176,26 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
int grid_cols =
std::min((out_cols + block_cols - 1) / block_cols, max_blocks);
std::min((out_col + block_cols - 1) / block_cols, max_blocks);
int grid_rows =
std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1));
std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) {
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
ins_gpu, cols, out_rows, out_cols, output->data<T>());
dev_ins_data, in_col, out_row, out_col, output->data<T>());
} else {
const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace());
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows,
out_cols, output->data<T>());
dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
out_row, out_col, output->data<T>());
}
}
};
/*
* All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension.
* each dimension must be the same, except the axis dimension.
*/
template <typename T>
class ConcatGradFunctor<platform::CUDADeviceContext, T> {
......@@ -206,41 +204,40 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& input, const int axis,
std::vector<framework::Tensor>& outputs) {
// TODO(zcd): Add input data validity checking
int num = outputs.size();
int input_row = 1;
int o_num = outputs.size();
int out_row = 1;
auto dim_0 = outputs[0].dims();
for (int i = 0; i < axis; ++i) {
input_row *= dim_0[i];
out_row *= dim_0[i];
}
int output_col_0 = outputs[0].numel() / input_row;
int input_col = 0;
int out_col = outputs[0].numel() / out_row;
int in_col = 0, in_row = out_row;
bool sameShape = true;
framework::Vector<int16_t> outputs_data(num * sizeof(T*) / 2);
framework::Vector<int> outputs_cols(num + 1);
outputs_cols[0] = 0;
framework::Vector<int16_t> outputs_data(o_num * sizeof(T*) / 2);
framework::Vector<int> outputs_cols(o_num + 1);
T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());
for (int i = 0; i < num; ++i) {
int t_col = outputs[i].numel() / input_row;
outputs_cols[0] = 0;
for (int i = 0; i < o_num; ++i) {
int t_col = outputs[i].numel() / out_row;
if (sameShape) {
if (t_col != output_col_0) sameShape = false;
if (t_col != out_col) sameShape = false;
}
input_col += t_col;
outputs_cols[i + 1] = input_col;
in_col += t_col;
outputs_cols[i + 1] = in_col;
outputs_ptr[i] = outputs[i].data<T>();
}
T** outs_gpu =
T** dev_out_gpu_data =
reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());
// computation
const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock;
if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((input_col + 31) >> 5) << 5;
if (in_col < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((in_col + 31) >> 5) << 5;
}
int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1);
......@@ -249,18 +246,19 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
int grid_cols =
std::min((input_col + block_cols - 1) / block_cols, max_blocks);
std::min((in_col + block_cols - 1) / block_cols, max_blocks);
int grid_rows =
std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1));
std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) {
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), input_row, input_col, output_col_0, outs_gpu);
input.data<T>(), in_row, in_col, out_col, dev_out_gpu_data);
} else {
const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), input_row, input_col, outs_col_gpu,
static_cast<int>(outputs_cols.size()), outs_gpu);
input.data<T>(), in_row, in_col, dev_outs_col_data,
static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
}
}
};
......
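The block/grid sizing used by both CUDA functors above follows the same arithmetic. A minimal Python sketch of that logic (max_threads here is a hypothetical device limit; in the functors it comes from the device context):

def concat_launch_config(out_col, out_row, max_threads):
    # Mirrors the sizing code in ConcatFunctor/ConcatGradFunctor (sketch only).
    k_threads_per_block = 1024
    block_cols = k_threads_per_block
    if out_col < k_threads_per_block:  # block_cols is aligned up to a multiple of 32
        block_cols = ((out_col + 31) >> 5) << 5
    block_rows = k_threads_per_block // block_cols
    max_blocks = max(max_threads // k_threads_per_block, 1)
    grid_cols = min((out_col + block_cols - 1) // block_cols, max_blocks)
    grid_rows = min(max_blocks // grid_cols, max(out_row // block_rows, 1))
    return (block_cols, block_rows), (grid_cols, grid_rows)

# For example, a 100-column, 8-row output on a device exposing 2048 threads
# yields block = (128, 8) and grid = (1, 1).
print(concat_launch_config(100, 8, 2048))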
/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <hip/hip_runtime.h>
......@@ -324,8 +324,9 @@ MatchIndices elements with value -1.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp,
ops::MineHardExamplesOpMaker);
REGISTER_OPERATOR(mine_hard_examples, ops::MineHardExamplesOp,
ops::MineHardExamplesOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
mine_hard_examples,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename Functor>
class MKLDNNActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE(context.Input<framework::Tensor>("X") != nullptr,
"Cannot get input tensor X, variable name = %s",
context.op().Input("X"));
PADDLE_ENFORCE(context.Output<framework::Tensor>("Out") != nullptr,
"Cannot find output tensor Out, variable name = %s",
context.op().Output("Out"));
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
template <typename Functor>
class MKLDNNActivationGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
namespace {
framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx,
const framework::OperatorWithKernel& oper) {
framework::LibraryType library{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
}
#endif
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.GetPlace(), layout, library);
}
} // anonymous namespace
class ActivationWithMKLDNNOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetKernelType(ctx, *this);
}
};
class ActivationWithMKLDNNOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetKernelType(ctx, *this);
}
};
} // namespace operators
} // namespace paddle
......@@ -168,7 +168,9 @@ https://arxiv.org/abs/1512.02325.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker);
REGISTER_OPERATOR(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
prior_box, ops::PriorBoxOpKernel<paddle::platform::CPUPlace, float>,
ops::PriorBoxOpKernel<paddle::platform::CPUPlace, double>);
......@@ -27,6 +27,7 @@ namespace operators {
static constexpr char kX[] = "X";
static constexpr char kCaseToExecute[] = "case_to_execute";
static constexpr char kOutputs[] = "Out";
static constexpr char kCases[] = "cases";
static constexpr char kCasesBlock[] = "sub_block";
......@@ -388,6 +389,10 @@ class SelectOpMaker : public framework::OpProtoAndCheckerMaker {
"(Int) The variable the sets the index of the case to execute, "
"after evaluating the channels being sent to and received from")
.AsDuplicable();
AddOutput(kOutputs,
"A set of variables, which will be assigned with values "
"generated by the operators inside the cases of Select Op.")
.AsDuplicable();
AddAttr<std::vector<std::string>>(kCases,
"(String vector) Serialized list of"
"all cases in the select op. Each"
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include <future>
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
......@@ -59,6 +60,9 @@ class SendOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
auto client_var_name = Output("RPCClient");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
"Can not find variable '%s' in the scope.",
......@@ -68,7 +72,7 @@ class SendOp : public framework::OperatorBase {
for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
VLOG(2) << "sending " << ins[i] << " to " << epmap[i];
rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
} else {
VLOG(3) << "don't send uninitialized variable: " << ins[i];
......@@ -77,20 +81,20 @@ class SendOp : public framework::OperatorBase {
PADDLE_ENFORCE(rpc_client->Wait());
for (auto& ep : endpoints) {
VLOG(3) << "batch barrier, ep: " << ep;
VLOG(2) << "batch barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep);
}
PADDLE_ENFORCE(rpc_client->Wait());
if (outs.size() > 0) {
for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
VLOG(2) << "getting " << outs[i] << " from " << epmap[i];
rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
}
PADDLE_ENFORCE(rpc_client->Wait());
// tell pservers that the current trainer has called fetch
for (auto& ep : endpoints) {
VLOG(3) << "send fetch barrier, ep: " << ep;
VLOG(2) << "send fetch barrier, ep: " << ep;
rpc_client->AsyncSendFetchBarrier(ep);
}
PADDLE_ENFORCE(rpc_client->Wait());
......
......@@ -153,8 +153,8 @@ template struct NegTargetAssignFunctor<platform::CPUDeviceContext, float,
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(target_assign, ops::TargetAssignOp,
ops::TargetAssignOpMaker);
REGISTER_OPERATOR(target_assign, ops::TargetAssignOp, ops::TargetAssignOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
target_assign,
ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, int, float>,
......
......@@ -49,7 +49,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS})
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
......
if(WITH_PYTHON)
cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID)
target_link_libraries(paddle_pybind rt)
endif(NOT APPLE AND NOT ANDROID)
if(WITH_AMD_GPU)
hip_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
${GLOB_OP_LIB})
else()
cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID)
target_link_libraries(paddle_pybind rt)
endif(NOT APPLE AND NOT ANDROID)
endif(WITH_AMD_GPU)
endif(WITH_PYTHON)
......@@ -37,6 +37,7 @@ function cmake_gen() {
-DWITH_DSO=ON
-DWITH_DOC=${WITH_DOC:-OFF}
-DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
-DWITH_MKL=${WITH_MKL:-ON}
-DWITH_AVX=${WITH_AVX:-OFF}
......@@ -50,6 +51,7 @@ function cmake_gen() {
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
-DWITH_TESTING=${WITH_TESTING:-ON}
-DWITH_FAST_BUNDLE_TEST=ON
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
========================================
EOF
......@@ -62,6 +64,7 @@ EOF
-DWITH_DSO=ON \
-DWITH_DOC=${WITH_DOC:-OFF} \
-DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
-DWITH_MKL=${WITH_MKL:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \
......@@ -74,6 +77,7 @@ EOF
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-DWITH_TESTING=${WITH_TESTING:-ON} \
-DWITH_FAST_BUNDLE_TEST=ON \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
}
......
......@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from layers.control_flow import BlockGuard, Select
from layers.control_flow import BlockGuard, equal
from .framework import Operator
from layer_helper import LayerHelper, unique_name
from layers import fill_constant
import core
......@@ -75,6 +76,185 @@ class Go(BlockGuard):
attrs={'sub_block': go_block})
class SelectCase(object):
DEFAULT = 0
SEND = 1
RECEIVE = 2
def __init__(self,
case_idx,
case_to_execute,
channel_action_fn=None,
channel=None,
value=None):
self.helper = LayerHelper('conditional_block')
self.main_program = self.helper.main_program
self.is_scalar_condition = True
self.case_to_execute = case_to_execute
self.idx = case_idx
# Since we aren't going to use the `channel_send` or `channel_recv`
# functions directly, we just need to capture the name.
self.action = (self.SEND
if channel_action_fn.__name__ == ('channel_send') else
self.RECEIVE) if channel_action_fn else self.DEFAULT
self.value = value
self.channel = channel
def __enter__(self):
self.block = self.main_program.create_block()
def construct_op(self):
main_program = self.helper.main_program
cases_block = main_program.current_block()
inner_outputs = set()
input_set = set()
params = set()
for op in self.block.ops:
# Iterate over all operators, get all the inputs
# and add as input to the SelectCase operator.
for iname in op.input_names:
for in_var_name in op.input(iname):
if in_var_name not in inner_outputs:
input_set.add(in_var_name)
for oname in op.output_names:
for out_var_name in op.output(oname):
inner_outputs.add(out_var_name)
param_list = [
cases_block.var(each_name) for each_name in params
if each_name not in input_set
]
# Iterate over all operators, get all the outputs
# add to the output list of SelectCase operator only if
# they exist in the parent block.
out_vars = []
for inner_out_name in inner_outputs:
if inner_out_name in cases_block.vars:
out_vars.append(cases_block.var(inner_out_name))
# First, create an op that will determine whether or not this is the
# conditional variable to execute.
should_execute_block = equal(
fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx),
self.case_to_execute)
step_scope = cases_block.create_var(
type=core.VarDesc.VarType.STEP_SCOPES)
cases_block.append_op(
type='conditional_block',
inputs={'X': [should_execute_block],
'Params': param_list},
outputs={'Out': out_vars,
'Scope': [step_scope]},
attrs={
'sub_block': self.block,
'is_scalar_condition': self.is_scalar_condition
})
return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name
if self.channel else '', self.value.name
if self.value else '')
def __exit__(self, exc_type, exc_val, exc_tb):
self.main_program.rollback()
if exc_type is not None:
return False # re-raise exception
return True
class Select(BlockGuard):
def __init__(self, name=None):
self.helper = LayerHelper('select', name=name)
self.cases = []
super(Select, self).__init__(self.helper.main_program)
self.case_to_execute = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
def __enter__(self):
super(Select, self).__enter__()
return self
def case(self, channel_action_fn, channel, value):
"""Create a new block for this condition.
"""
select_case = SelectCase(
len(self.cases), self.case_to_execute, channel_action_fn, channel,
value)
self.cases.append(select_case)
return select_case
def default(self):
"""Create a default case block for this condition.
"""
default_case = SelectCase(len(self.cases), self.case_to_execute)
self.cases.append(default_case)
return default_case
def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is not None:
return False
# Create a select op and another block to wrap its
# case blocks.
select_block = self.helper.main_program.current_block()
parent_block = self.helper.main_program.block(select_block.parent_idx)
# Construct each case op, inside the newly created select block.
serialized_cases = []
for case in self.cases:
serialized_cases.append(case.construct_op())
intermediate = set()
params = set()
for case_block in select_block.ops:
if case_block.attrs and 'sub_block' in case_block.attrs:
for each_op in case_block.attrs['sub_block'].ops:
assert isinstance(each_op, Operator)
for iname in each_op.input_names:
for in_var_name in each_op.input(iname):
if in_var_name not in intermediate:
params.add(in_var_name)
for oname in each_op.output_names:
for out_var_name in each_op.output(oname):
intermediate.add(out_var_name)
out_list = [
parent_block.var(var_name) for var_name in parent_block.vars
if var_name in intermediate
]
X = [select_block.var_recursive(x_name) for x_name in params]
# Needs to be used by `equal` inside the cases block.
X.append(self.case_to_execute)
# Construct the select op.
parent_block.append_op(
type='select',
inputs={'X': X,
'case_to_execute': self.case_to_execute},
attrs={'sub_block': select_block,
'cases': serialized_cases},
outputs={'Out': out_list})
return super(Select, self).__exit__(exc_type, exc_val, exc_tb)
def make_channel(dtype, capacity=0):
"""
Helps implementation of a concurrent program by creating a "channel" of
......
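A minimal usage sketch of the Select/SelectCase API moved into this module (the surrounding program, channel dtype and values are hypothetical; it assumes make_channel, channel_send, channel_recv and Select are re-exported at the fluid level, as the concurrency tests of this era do):

import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.layers import fill_constant

ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
value = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=10)

with fluid.Select() as select:
    with select.case(fluid.channel_send, ch, value):
        pass  # ops for the send case are built inside this block
    with select.case(fluid.channel_recv, ch, value):
        pass  # ops for the receive case are built inside this block
    with select.default():
        pass  # ops for the default case are built inside this block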
......@@ -565,6 +565,8 @@ class DistributeTranspiler:
orig_var_name = ""
if suff_idx >= 0:
orig_var_name = varname[:suff_idx]
else:
orig_var_name = varname
return orig_var_name
def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
......@@ -579,7 +581,8 @@ class DistributeTranspiler:
grad_block = None
for g in self.param_grad_ep_mapping[endpoint]["grads"]:
if same_or_split_var(
self._orig_varname(g.name), opt_op.input(key)[0]):
self._orig_varname(g.name),
self._orig_varname(opt_op.input(key)[0])):
grad_block = g
break
if not grad_block:
......@@ -750,7 +753,7 @@ class DistributeTranspiler:
param_names = [
p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
]
if op.input("Param") in param_names:
if op.input("Param")[0] in param_names:
return True
else:
for n in param_names:
......
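The _orig_varname change above makes the helper fall back to the full variable name when no split suffix is present, instead of returning an empty string. A standalone sketch of that behaviour (the suffix marker used here is a hypothetical stand-in for the transpiler's real delimiter):

def orig_varname(varname, suffix_marker=".block"):
    # Strip the split suffix if present; otherwise return the name unchanged.
    suff_idx = varname.find(suffix_marker)
    return varname[:suff_idx] if suff_idx >= 0 else varname

assert orig_varname("fc_0.w_0@GRAD.block0") == "fc_0.w_0@GRAD"
assert orig_varname("fc_0.w_0@GRAD") == "fc_0.w_0@GRAD"  # previously came back empty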
......@@ -403,6 +403,8 @@ class LayerHelper(object):
if 'use_mkldnn' in self.kwargs:
act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
act_type = act.pop('type')
if 'use_mkldnn' in self.kwargs:
act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
self.append_op(
type=act_type,
inputs={"X": [input_var]},
......
......@@ -16,7 +16,7 @@ import contextlib
from layer_function_generator import autodoc
from tensor import assign, fill_constant
from .. import core
from ..framework import Program, Variable, Operator, Block
from ..framework import Program, Variable, Operator
from ..layer_helper import LayerHelper, unique_name
from ops import logical_and, logical_not, logical_or
......@@ -29,7 +29,6 @@ __all__ = [
'WhileGuard',
'While',
'Switch',
'Select',
'lod_rank_table',
'max_sequence_len',
'topk',
......@@ -1212,186 +1211,6 @@ class Switch(object):
return True
class SelectCase(object):
DEFAULT = 0
SEND = 1
RECEIVE = 2
def __init__(self,
case_idx,
case_to_execute,
channel_action_fn=None,
channel=None,
value=None):
self.helper = LayerHelper('conditional_block')
self.main_program = self.helper.main_program
self.is_scalar_condition = True
self.case_to_execute = case_to_execute
self.idx = case_idx
# Since we aren't going to use the `channel_send` or `channel_recv`
# functions directly, we just need to capture the name.
self.action = (self.SEND
if channel_action_fn.__name__ == ('channel_send') else
self.RECEIVE) if channel_action_fn else (self.DEFAULT)
self.value = value
self.channel = channel
def __enter__(self):
self.block = self.main_program.create_block()
def construct_op(self):
main_program = self.helper.main_program
cases_block = main_program.current_block()
inner_outputs = set()
input_set = set()
params = set()
for op in self.block.ops:
# Iterate over all operators, get all the inputs
# and add as input to the SelectCase operator.
for iname in op.input_names:
for in_var_name in op.input(iname):
if in_var_name not in inner_outputs:
input_set.add(in_var_name)
for oname in op.output_names:
for out_var_name in op.output(oname):
inner_outputs.add(out_var_name)
param_list = [
cases_block.var(each_name) for each_name in params
if each_name not in input_set
]
# Iterate over all operators, get all the outputs
# add to the output list of SelectCase operator only if
# they exist in the parent block.
out_vars = []
for inner_out_name in inner_outputs:
if inner_out_name in cases_block.vars:
out_vars.append(cases_block.var(inner_out_name))
# First, create an op that will determine whether or not this is the
# conditional variable to execute.
should_execute_block = equal(
fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx),
self.case_to_execute)
step_scope = cases_block.create_var(
type=core.VarDesc.VarType.STEP_SCOPES)
cases_block.append_op(
type='conditional_block',
inputs={'X': [should_execute_block],
'Params': param_list},
outputs={'Out': out_vars,
'Scope': [step_scope]},
attrs={
'sub_block': self.block,
'is_scalar_condition': self.is_scalar_condition
})
return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name
if self.channel else '', self.value.name
if self.value else '')
def __exit__(self, exc_type, exc_val, exc_tb):
self.main_program.rollback()
if exc_type is not None:
return False # re-raise exception
return True
class Select(BlockGuard):
def __init__(self, name=None):
self.helper = LayerHelper('select', name=name)
self.cases = []
super(Select, self).__init__(self.helper.main_program)
self.case_to_execute = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
def __enter__(self):
super(Select, self).__enter__()
return self
def case(self, channel_action_fn, channel, value):
"""Create a new block for this condition.
"""
select_case = SelectCase(
len(self.cases), self.case_to_execute, channel_action_fn, channel,
value)
self.cases.append(select_case)
return select_case
def default(self):
"""Create a default case block for this condition.
"""
default_case = SelectCase(len(self.cases), self.case_to_execute)
self.cases.append(default_case)
return default_case
def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is not None:
return False
# Create a select op and another block to wrap its
# case blocks.
select_block = self.helper.main_program.current_block()
parent_block = self.helper.main_program.block(select_block.parent_idx)
# Construct each case op, inside the newly created select block.
serialized_cases = []
for case in self.cases:
serialized_cases.append(case.construct_op())
intermediate = set()
params = set()
for case_block in select_block.ops:
if case_block.attrs and 'sub_block' in case_block.attrs:
for each_op in case_block.attrs['sub_block'].ops:
assert isinstance(each_op, Operator)
for iname in each_op.input_names:
for in_var_name in each_op.input(iname):
if in_var_name not in intermediate:
params.add(in_var_name)
for oname in each_op.output_names:
for out_var_name in each_op.output(oname):
intermediate.add(out_var_name)
# TODO(varunarora): Figure out if defining output is needed.
out_list = [
parent_block.var(var_name) for var_name in parent_block.vars
if var_name in intermediate
]
X = [select_block.var_recursive(x_name) for x_name in params]
# Needs to be used by `equal` inside the cases block.
X.append(self.case_to_execute)
# Construct the select op.
parent_block.append_op(
type='select',
inputs={'X': X,
'case_to_execute': self.case_to_execute},
attrs={'sub_block': select_block,
'cases': serialized_cases},
outputs={})
return super(Select, self).__exit__(exc_type, exc_val, exc_tb)
class IfElseBlockGuard(object):
def __init__(self, is_true, ifelse):
if not isinstance(ifelse, IfElse):
......
......@@ -129,13 +129,11 @@ def detection_output(loc,
prior_box_var=prior_box_var,
target_box=loc,
code_type='decode_center_size')
old_shape = scores.shape
scores = ops.reshape(x=scores, shape=(-1, old_shape[-1]))
scores = nn.softmax(input=scores)
scores = ops.reshape(x=scores, shape=old_shape)
scores = nn.transpose(scores, perm=[0, 2, 1])
nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
helper.append_op(
type="multiclass_nms",
......@@ -475,6 +473,7 @@ def ssd_loss(location,
# 2. Compute confidence for mining hard examples
# 2.1. Get the target label based on matched indices
gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, ))
gt_label.stop_gradient = True
target_label, _ = target_assign(
gt_label, matched_indices, mismatch_value=background_label)
# 2.2. Compute confidence loss.
......@@ -482,10 +481,12 @@ def ssd_loss(location,
confidence = __reshape_to_2d(confidence)
target_label = tensor.cast(x=target_label, dtype='int64')
target_label = __reshape_to_2d(target_label)
target_label.stop_gradient = True
conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
# 3. Mining hard examples
conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior))
conf_loss.stop_gradient = True
neg_indices = helper.create_tmp_variable(dtype='int32')
dtype = matched_indices.dtype
updated_matched_indices = helper.create_tmp_variable(dtype=dtype)
......@@ -695,6 +696,8 @@ def multi_box_head(inputs,
outputs={"Boxes": box,
"Variances": var},
attrs=attrs, )
box.stop_gradient = True
var.stop_gradient = True
return box, var
def _reshape_with_axis_(input, axis=1):
......
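The stop_gradient flags added above keep the assigned targets and the generated prior boxes out of the backward pass. A minimal sketch of the idea on a hypothetical variable:

import paddle.fluid as fluid

# Any Variable can be excluded from gradient computation by setting the flag,
# which is what ssd_loss and multi_box_head now do for their target tensors.
labels = fluid.layers.fill_constant(shape=[8, 1], dtype='int64', value=0)
labels.stop_gradient = True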
......@@ -506,5 +506,54 @@ class TestSwish(OpTest):
self.check_grad(['X'], 'Out', max_relative_error=0.008)
#--------------------test MKLDNN--------------------
class TestMKLDNNRelu(TestRelu):
def setUp(self):
super(TestMKLDNNRelu, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason as in TestAbs
x[np.abs(x) < 0.005] = 0.02
out = np.maximum(x, 0)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNTanh(TestTanh):
def setUp(self):
super(TestMKLDNNTanh, self).setUp()
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
}
self.outputs = {'Out': np.tanh(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNSqrt(TestSqrt):
def setUp(self):
super(TestMKLDNNSqrt, self).setUp()
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
}
self.outputs = {'Out': np.sqrt(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNAbs(TestAbs):
def setUp(self):
super(TestMKLDNNAbs, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason as in TestAbs
x[np.abs(x) < 0.005] = 0.02
self.inputs = {'X': x}
self.outputs = {'Out': np.abs(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
if __name__ == "__main__":
unittest.main()
......@@ -20,19 +20,35 @@ from op_test import OpTest
class TestConcatOp(OpTest):
def setUp(self):
self.op_type = "concat"
x0 = np.random.random((2, 1, 4, 5)).astype('float32')
x1 = np.random.random((2, 2, 4, 5)).astype('float32')
x2 = np.random.random((2, 3, 4, 5)).astype('float32')
axis = 1
self.inputs = {'X': [('x0', x0), ('x1', x1), ('x2', x2)]}
self.attrs = {'axis': axis}
self.outputs = {'Out': np.concatenate((x0, x1, x2), axis=axis)}
self.init_test_data()
self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
self.attrs = {'axis': self.axis}
self.outputs = {
'Out': np.concatenate(
(self.x0, self.x1, self.x2), axis=self.axis)
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['x0'], 'Out')
self.check_grad(['x1'], 'Out')
self.check_grad(['x2'], 'Out')
def init_test_data(self):
self.x0 = np.random.random((2, 1, 4, 5)).astype('float32')
self.x1 = np.random.random((2, 2, 4, 5)).astype('float32')
self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
self.axis = 1
class TestConcatOp2(OpTest):
def init_test_data(self):
self.x0 = np.random.random((2, 3, 4, 5)).astype('float32')
self.x1 = np.random.random((2, 3, 4, 5)).astype('float32')
self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
self.axis = 1
if __name__ == '__main__':
......
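With the refactor above, a new shape configuration only needs to override init_test_data. A hypothetical extra case, inheriting from TestConcatOp so it reuses setUp and the check methods, could look like:

class TestConcatOp3(TestConcatOp):
    def init_test_data(self):
        # Hypothetical configuration: identical trailing dims, concatenate on axis 0.
        self.x0 = np.random.random((1, 3, 4, 5)).astype('float32')
        self.x1 = np.random.random((2, 3, 4, 5)).astype('float32')
        self.x2 = np.random.random((3, 3, 4, 5)).astype('float32')
        self.axis = 0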
......@@ -87,5 +87,15 @@ class TestLRNOp(OpTest):
self.check_grad(['X'], 'Out', max_relative_error=0.01)
class TestLRNMKLDNNOp(TestLRNOp):
def get_attrs(self):
attrs = TestLRNOp.get_attrs(self)
attrs['use_mkldnn'] = True
return attrs
def test_check_output(self):
self.check_output(atol=0.002)
if __name__ == "__main__":
unittest.main()
......@@ -126,7 +126,6 @@ class TestTensor(unittest.TestCase):
def test_lod_tensor_gpu_init(self):
if not core.is_compiled_with_cuda():
return
scope = core.Scope()
place = core.CUDAPlace(0)
lod_py = [[0, 2, 5], [0, 2, 4, 5]]
lod_tensor = core.LoDTensor()
......@@ -144,6 +143,25 @@ class TestTensor(unittest.TestCase):
self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
self.assertListEqual(lod_py, lod_tensor.lod())
def test_empty_tensor(self):
place = core.CPUPlace()
scope = core.Scope()
var = scope.var("test_tensor")
tensor = var.get_tensor()
tensor.set_dims([0, 1])
tensor.alloc_float(place)
tensor_array = numpy.array(tensor)
self.assertEqual((0, 1), tensor_array.shape)
if core.is_compiled_with_cuda():
gpu_place = core.CUDAPlace(0)
tensor.alloc_float(gpu_place)
tensor_array = numpy.array(tensor)
self.assertEqual((0, 1), tensor_array.shape)
if __name__ == '__main__':
unittest.main()
......@@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could
be used in user program.
"""
__all__ = ['np_array', 'text_file', "cloud_reader"]
__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader']
def np_array(x):
......