Merge remote-tracking branch 'upstream/develop' into windows/build

36cd18b5 · peizhilin · b2f8d418 · 39ec80de · 36cd18b5 · 36cd18b5
78 changed file
--- a/.github/ISSUE_TEMPLATE/---feature-request-.md
+++ b/.github/ISSUE_TEMPLATE/---feature-request-.md
+---
+name: 建议(Feature request)
+about: 您可以提出您的建议。 You could use this template for reporting a suggestion  issue.
+
+---
+
+欢迎您对PaddlePaddle提出建议，非常感谢您对PaddlePaddle的贡献！
+在留下您的建议时，辛苦您同步提供如下信息：
+- 版本、环境信息
+1）PaddlePaddle版本：请提供您的PaddlePaddle版本号，例如1.1
+2）CPU/GPU：您是否使用GPU进行训练，如是，请提供您的CUDA和cuDNN版本号
+3）系统环境：请您描述系统类型、版本，例如Mac OS 10.14
+- 复现信息：如为报错，请给出复现环境、复现步骤
+- 建议描述：请您详细描述，您认为需优化的功能
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
+Please make sure that this is a feature request. 
+**System information**
+-PaddlePaddle version （eg.1.1）or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg.Mac OS 10.14)
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe the feature and the current behavior/state.**
+**Any Other info.**
--- a/.github/ISSUE_TEMPLATE/---inference-issue-.md
+++ b/.github/ISSUE_TEMPLATE/---inference-issue-.md
+---
+name: 预测（Inference Issue）
+about: 您可以提问预测中报错、应用等问题。 You could use this template for reporting an inference issue.
+
+---
+
+为使您的问题得到快速解决，在建立Issue前，请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+如果您没有查询到相似问题，为快速解决您的提问，建立issue时请提供如下细节信息：
+- 标题：简洁、精准描述您的问题，例如“最新预测库的API文档在哪儿 ”
+- 版本、环境信息：
+    1）PaddlePaddle版本：请提供您的PaddlePaddle版本号（如1.1）或CommitID
+    2）CPU：预测若用CPU，请提供CPU型号，MKL/OpenBlas/MKLDNN/等数学库使用情况
+    3）GPU：预测若用GPU，请提供GPU型号、CUDA和CUDNN版本号
+    4）系统环境：请您描述系统类型、版本（如Mac OS 10.14），Python版本
+-预测信息
+    1）C++预测：请您提供预测库安装包的版本信息，及其中的version.txt文件
+    2）CMake包含路径的完整命令
+    3）API信息（如调用请提供）
+    4）预测库来源：官网下载/特殊环境（如BCLOUD编译）
+- 复现信息：如为报错，请给出复现环境、复现步骤
+- 问题描述：请详细描述您的问题，同步贴出报错信息、日志/代码关键片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github in case that th
+If there is no solution,please make sure that this is an inference issue including the following details :
+**System information**
+-PaddlePaddle version （eg.1.1）or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg.Mac OS 10.14)
+-Python version
+-Cmake orders
+-C++version.txt
+-API information
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
--- a/.github/ISSUE_TEMPLATE/---installation-issue-.md
+++ b/.github/ISSUE_TEMPLATE/---installation-issue-.md
+---
+name: 安装（Installation Issue）
+about: 您可以提问安装、编译出现报错等问题。 You could use this template for reporting an installation
+   issue.
+
+---
+
+为使您的问题得到快速解决，在建立Issue前，请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+建立issue时，为快速解决问题，请您根据使用情况给出如下信息：
+- 标题：请包含关键词“安装错误”/“编译错误”，例如“Mac编译错误”
+- 版本、环境信息：
+    1）PaddlePaddle版本：请提供您的PaddlePaddle版本号（如1.1）或CommitID
+    2）CPU：请提供CPU型号，MKL/OpenBlas/MKLDNN/等数学库的使用情况
+    3）GPU：请提供GPU型号，CUDA和CUDNN版本号
+    4）系统环境：请说明系统类型、版本（如Mac OS 10.14）、Python版本
+- 安装方式信息：
+1）pip安装/docker安装
+2）本地编译：请提供cmake命令，编译命令
+3）docker编译：请提供docker镜像，编译命令            
+  特殊环境请注明：如离线安装等
+- 复现信息：如为报错，请给出复现环境、复现步骤
+- 问题描述：请详细描述您的问题，同步贴出报错信息、日志/代码关键片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in Github in case that there was a similar issue submitted or resolved before.
+If there is no solution,please make sure that this is an installation issue including the following details:
+**System information**
+-PaddlePaddle version （eg.1.1）or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg. Mac OS 10.14)
+-Python version
+- Install method: pip install/install with docker/build from source(without docker)/build within docker
+- Other special cases that you think may be related to this problem, eg. offline install, special internet condition   
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
--- a/.github/ISSUE_TEMPLATE/---model-issue-.md
+++ b/.github/ISSUE_TEMPLATE/---model-issue-.md
+---
+name: 模型（Model Issue）
+about: 您可以提问模型、算法、数据集方向的使用报错等问题。You could use this template for reporting a model/
+  algorithm/dataset  issue.
+
+---
+
+为使您的问题得到快速解决，在建立Issue前，请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+建立issue时，为快速解决问题，请您根据使用情况给出如下信息：
+- 标题：简洁、精准描述您的问题，例如“ssd 模型前置lstm报错  ”
+- 版本、环境信息：
+    1）PaddlePaddle版本：请提供PaddlePaddle版本号，例如1.1或CommitID
+    2）CPU：请提供CPU型号，MKL/OpenBlas/MKLDNN/等数学库的使用情况
+    3）GPU：请提供GPU型号，CUDA和CUDNN版本号
+    4）系统环境：请说明系统类型、版本（例如Mac OS 10.14），Python版本
+- 模型信息
+    1）模型名称 2）使用数据集名称 3）使用算法名称 4）模型链接
+- 复现信息：如为报错，请给出复现环境、复现步骤
+- 问题描述：请详细描述您的问题，同步贴出报错信息、日志/代码关键片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github.Probably there was a similar issue submitted or resolved before.
+If there is no solution,please make sure that this is a issue of models including the following details:
+**System information**
+-PaddlePaddle version （eg.1.1）or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg.Mac OS 10.14)
+-Python version
+-Name of Models&Dataset/details of operator
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
--- a/.github/ISSUE_TEMPLATE/---others-.md
+++ b/.github/ISSUE_TEMPLATE/---others-.md
+---
+name: 其他（Others）
+about: 如上述分类未包含您的问题，可在此提出。 You could use this template for reporting other issues
+
+---
+
+为使您的问题得到快速解决，在建立Issues前，请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+如果您没有查询到相似问题，为快速解决您的提问，建立issue时请提供如下细节信息：
+- 标题：简洁、精准概括您的问题
+- 版本、环境信息：
+    1）PaddlePaddle版本：请提供您的PaddlePaddle版本号，例如1.1或CommitID
+    2）CPU/GPU：如果您使用GPU训练，请提供GPU驱动版本、CUDA和cuDNN版本号
+    3）系统环境：请您描述系统类型、版本，例如Mac OS 10.14
+    4）Python版本号
+    5）显存信息
+- 复现信息：如为报错，请给出复现环境、复现步骤
+- 问题描述：请详细描述您的问题，同步贴出报错信息、日志/代码关键片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
+If there is no solution,please provide us with the following details :
+**System information**
+-PaddlePaddle version （eg.1.1）or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/cuDNN version
+-OS Platform and Distribution(eg.Mac OS 10.14)
+-Python version 
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
--- a/.github/ISSUE_TEMPLATE/---training-issue-.md
+++ b/.github/ISSUE_TEMPLATE/---training-issue-.md
+---
+name: 训练（Training issue）
+about: 您可以提问训练中报错、应用、出core等问题。 You could use this template for reporting an training
+   issue.
+
+---
+
+为使您的问题得到快速解决，在建立Issues前，请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+如果您没有查询到相似问题，为快速解决您的提问，建立issue时请提供如下细节信息：
+- 标题：简洁、精准概括您的问题，例如“Insufficient Memory xxx" ”
+- 版本、环境信息：
+    1）PaddlePaddle版本：请提供您的PaddlePaddle版本号，例如1.1或CommitID
+    2）CPU：预测若用CPU，请提供CPU型号，MKL/OpenBlas/MKLDNN/等数学库使用情况
+    3）GPU：预测若用GPU，请提供GPU型号、CUDA和CUDNN版本号
+    4）系统环境：请您描述系统类型、版本，例如Mac OS 10.14，Python版本
+- 训练信息
+    1）单机/多机，单卡/多卡
+    2）显存信息
+    3）Operator信息
+- 复现信息：如为报错，请给出复现环境、复现步骤
+- 问题描述：请详细描述您的问题，同步贴出报错信息、日志、可复现的代码片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
+If there is no solution,please make sure that this is a training issue including the following details:
+**System information**
+-PaddlePaddle version （eg.1.1）or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg.Mac OS 10.14)
+-Other imformation: Distriuted training/informantion of operator/
+Graphics card storage
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -204,7 +204,9 @@ include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
+include(external/rocprim)
 include(external/xxhash)    # download xxhash
+include(external/dlpack)
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream


--- a/Dockerfile
+++ b/Dockerfile
@@ -22,6 +22,27 @@ ENV HOME /root
 # Add bash enhancements
 COPY ./paddle/scripts/docker/root/ /root/

+# Prepare packages for Python
+RUN apt-get update && \
+    apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
+    libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
+    xz-utils tk-dev libffi-dev liblzma-dev
+
+# Install Python3.6
+RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \
+    tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \
+    ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \
+    wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \
+    tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \
+    CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
+    make -j8 > /dev/null && make altinstall > /dev/null
+
+# Install Python3.7
+RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
+    tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \
+    CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
+    make -j8 > /dev/null && make altinstall > /dev/null
+
 RUN apt-get update && \
    apt-get install -y --allow-downgrades patchelf \
    python3 python3-dev python3-pip \
@@ -74,6 +95,12 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 RUN pip3 install -U wheel && \
    pip3 install -U docopt PyYAML sphinx==1.5.6 && \
    pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip3.6 install -U wheel && \
+    pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip3.7 install -U wheel && \
+    pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \
    easy_install -U pip && \
    pip install -U pip setuptools wheel && \
    pip install -U docopt PyYAML sphinx==1.5.6 && \
@@ -82,22 +109,34 @@ RUN pip3 install -U wheel && \
 RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
    pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
    pip3 install opencv-python && \
+    pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3.6 install opencv-python && \
+    pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3.7 install opencv-python && \
    pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
    pip install opencv-python

 #For docstring checker
 RUN pip3 install pylint pytest astroid isort
+RUN pip3.6 install pylint pytest astroid isort
+RUN pip3.7 install pylint pytest astroid isort
 RUN pip install pylint pytest astroid isort LinkChecker

 COPY ./python/requirements.txt /root/
 RUN pip3 install -r /root/requirements.txt
+RUN pip3.6 install -r /root/requirements.txt
+RUN pip3.7 install -r /root/requirements.txt
 RUN pip install -r /root/requirements.txt

 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
 RUN apt-get install -y libssl-dev libffi-dev
 RUN pip3 install certifi urllib3[secure]
+RUN pip3.6 install certifi urllib3[secure]
+RUN pip3.7 install certifi urllib3[secure]
 RUN pip install certifi urllib3[secure]



--- a/cmake/external/dlpack.cmake
+++ b/cmake/external/dlpack.cmake
+include(ExternalProject)
+
+set(DLPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/dlpack)
+set(DLPACK_INCLUDE_DIR ${DLPACK_SOURCE_DIR}/src/extern_dlpack/include)
+
+include_directories(${DLPACK_INCLUDE_DIR})
+
+ExternalProject_Add(
+  extern_dlpack
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  GIT_REPOSITORY "https://github.com/dmlc/dlpack.git"
+  GIT_TAG        "v0.2"
+  PREFIX         ${DLPACK_SOURCE_DIR}
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND   ""
+  TEST_COMMAND      ""
+)
+
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
+  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/dlpack_dummy.c)
+  file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+  add_library(dlpack STATIC ${dummyfile})
+else()
+  add_library(dlpack INTERFACE)
+endif()
+
+add_dependencies(dlpack extern_dlpack)
+
+LIST(APPEND externl_project_dependencies dlpack)
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -53,7 +53,7 @@ ExternalProject_Add(
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "21fb5f2af1dd14e132af4f1b79160977ee487818"
+    GIT_TAG             "830a10059a018cd2634d94195140cf2d8790a75a"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

--- a/cmake/external/rocprim.cmake
+++ b/cmake/external/rocprim.cmake
+if (NOT WITH_AMD_GPU)
+    return()
+endif()
+
+# rocprim is "ROCm Parallel Primitives" for short.
+# It is a header-only library providing HIP and HC parallel primitives
+# for developing performant GPU-accelerated code on AMD ROCm platform.
+
+if("x${HCC_HOME}" STREQUAL "x")
+  set(HCC_HOME "/opt/rocm/hcc")
+endif()
+
+INCLUDE(ExternalProject)
+
+SET(ROCPRIM_SOURCE_DIR ${THIRD_PARTY_PATH}/rocprim)
+SET(ROCPRIM_INSTALL_DIR  ${THIRD_PARTY_PATH}/install/rocprim)
+SET(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include)
+
+ExternalProject_Add(
+    extern_rocprim
+    GIT_REPOSITORY "https://github.com/ROCmSoftwarePlatform/rocPRIM.git"
+    GIT_TAG        5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc 
+    PREFIX         ${ROCPRIM_SOURCE_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS     -DCMAKE_CXX_COMPILER=${HCC_HOME}/bin/hcc
+    CMAKE_ARGS     -DONLY_INSTALL=ON
+    CMAKE_ARGS     -DBUILD_TEST=OFF
+    CMAKE_ARGS     -DCMAKE_INSTALL_PREFIX=${ROCPRIM_INSTALL_DIR}
+
+    INSTALL_DIR    ${ROCPRIM_INSTALL_DIR}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+)
+
+INCLUDE_DIRECTORIES(${ROCPRIM_INCLUDE_DIR})
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/rocprim_dummy.c)
+    file(WRITE ${dummyfile} "const char *dummy_rocprim = \"${dummyfile}\";")
+    add_library(rocprim STATIC ${dummyfile})
+else()
+    add_library(rocprim INTERFACE)
+endif()
+
+add_dependencies(rocprim extern_rocprim)
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -129,6 +129,9 @@ set(COMMON_FLAGS
    -Wno-error=parentheses-equality # Warnings in pybind11
    -Wno-error=ignored-attributes  # Warnings in Eigen, gcc 6.3
    -Wno-error=terminate  # Warning in PADDLE_ENFORCE
+    -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
+    -Wimplicit-fallthrough=0 # Warning in tinyformat.h
+    -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
 )

 set(GPU_COMMON_FLAGS

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -461,25 +461,29 @@ function(hip_library TARGET_NAME)
      else()
        add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
-        target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
-	find_fluid_modules(${TARGET_NAME})
+        target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a /opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so)
+        find_fluid_modules(${TARGET_NAME})
      endif()
-      if (hip_library_DEPS)
-	add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
-	target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+      if("${hip_library_DEPS}" MATCHES "ARCHIVE_START")
+        # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
+        # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
+        target_circle_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+        list(REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END)
+      else()
+        target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
      endif()
      # cpplint code style
      foreach(source_file ${hip_library_SRCS})
-	string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
-	if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-	  list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-	endif()
+        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        endif()
      endforeach()
    else(hip_library_SRCS)
      if (hip_library_DEPS)
-	merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
+        merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
      else()
-	message(FATAL "Please specify source file or library in nv_library.")
+        message(FATAL "Please specify source file or library in nv_library.")
      endif()
    endif(hip_library_SRCS)
  endif()

--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -3,6 +3,8 @@ if(NOT WITH_AMD_GPU)
 endif()

 include_directories("/opt/rocm/include")
+include_directories("/opt/rocm/hip/include")
+include_directories("/opt/rocm/miopen/include")
 include_directories("/opt/rocm/hipblas/include")
 include_directories("/opt/rocm/hiprand/include")
 include_directories("/opt/rocm/rocrand/include")
@@ -11,20 +13,40 @@ include_directories("/opt/rocm/thrust")

 list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")

-set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" )
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )

 if(WITH_DSO)
  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
 endif(WITH_DSO)

-if(WITH_DOUBLE)
-  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE")
-endif(WITH_DOUBLE)
-
 if(WITH_TESTING)
  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
 endif(WITH_TESTING)

+if(WITH_DISTRIBUTE)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE")
+endif(WITH_DISTRIBUTE)
+
+if(WITH_GRPC)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC")
+endif(WITH_GRPC)
+
+if(NOT WITH_GOLANG)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG")
+endif(NOT WITH_GOLANG)
+
+if(WITH_MKLDNN)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN")
+endif(WITH_MKLDNN)
+
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
+
+if(NOT WITH_RDMA)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA")
+endif(NOT WITH_RDMA)
+
+
+
 if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
    list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
 elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -276,7 +276,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k
 paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False))
-paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, None))
+paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None))
 paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False))
 paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
@@ -342,7 +342,7 @@ paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], va
 paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ 
 paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True))
-paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max'))
+paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None))
 paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
 paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
 paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))

--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -116,8 +116,9 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)

+cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
-    shape_inference data_transform lod_tensor profiler)
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache)

 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)

@@ -192,3 +193,6 @@ cc_test(tuple_test SRCS tuple_test.cc )
 if (NOT WIN32)
 cc_test(rw_lock_test SRCS rw_lock_test.cc)
 endif (NOT WIN32)
+
+cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack)
+cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog)
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/dlpack_tensor.h"
+
+namespace paddle {
+namespace framework {
+
+namespace internal {
+template <typename T>
+static ::DLDataType GetDLDataTypeCode() {
+  ::DLDataType dtype;
+  if (std::is_same<T, platform::float16>::value ||
+      std::is_floating_point<T>::value) {
+    dtype.code = kDLFloat;
+  } else if (std::is_unsigned<T>::value) {
+    dtype.code = kDLUInt;
+  } else if (std::is_integral<T>::value) {
+    dtype.code = kDLInt;
+  } else {
+    PADDLE_THROW("Unsupported data type %s", typeid(T).name());
+  }
+  dtype.bits = 8 * sizeof(T);
+  dtype.lanes = 1;
+  return dtype;
+}
+
+static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) {
+#define REG_DL_DATA_TYPE(type) \
+  { std::type_index(typeid(type)), GetDLDataTypeCode<type>() }
+  static const std::unordered_map<std::type_index, ::DLDataType>
+      type_to_dtype_map({
+          REG_DL_DATA_TYPE(platform::float16),  // NOLINT
+          REG_DL_DATA_TYPE(float),              // NOLINT
+          REG_DL_DATA_TYPE(double),             // NOLINT
+          REG_DL_DATA_TYPE(int),                // NOLINT
+          REG_DL_DATA_TYPE(int64_t),            // NOLINT
+          REG_DL_DATA_TYPE(bool),               // NOLINT
+          REG_DL_DATA_TYPE(size_t),             // NOLINT
+          REG_DL_DATA_TYPE(int16_t),            // NOLINT
+          REG_DL_DATA_TYPE(uint8_t),            // NOLINT
+          REG_DL_DATA_TYPE(int8_t)              // NOLINT
+      });
+  static auto type_to_dtype_map_end_it = type_to_dtype_map.end();
+  auto it = type_to_dtype_map.find(type);
+  PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %s",
+                 type.name());
+  return it->second;
+#undef REG_DL_DATA_TYPE
+}
+
+struct DLContextVisitor : public boost::static_visitor<::DLContext> {
+  inline ::DLContext operator()(const platform::CPUPlace &place) const {
+    DLContext ctx;
+    ctx.device_type = kDLCPU;
+    ctx.device_id = 0;
+    return ctx;
+  }
+
+  inline ::DLContext operator()(const platform::CUDAPlace &place) const {
+#ifdef PADDLE_WITH_CUDA
+    DLContext ctx;
+    ctx.device_type = kDLGPU;
+    ctx.device_id = place.device;
+    return ctx;
+#else
+    PADDLE_THROW("platform::CUDAPlace is not supported in CPU only version");
+#endif
+  }
+
+  inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const {
+#ifdef PADDLE_WITH_CUDA
+    DLContext ctx;
+    ctx.device_type = kDLCPUPinned;
+    ctx.device_id = 0;
+    return ctx;
+#else
+    PADDLE_THROW(
+        "platform::CUDAPinnedPlace is not supported in CPU only version");
+#endif
+  }
+};
+}  // namespace internal
+
+DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) {
+  // init data, data buffer
+  t_.data = const_cast<void *>(tensor.data<void>());
+
+  // init ctx, DLContext type with device_type and device_id
+  auto place = tensor.place();
+  t_.ctx = boost::apply_visitor(internal::DLContextVisitor(), place);
+
+  // init dtype
+  t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type());
+  t_.dtype.lanes = lanes;
+
+  // init ndim, tensor rank
+  auto &dims = tensor.dims();
+  using DimType = decltype(t_.ndim);  // int
+  t_.ndim = static_cast<DimType>(dims.size());
+
+  // init shape, tensor dims
+  t_.shape = shape_;
+  for (DimType i = 0; i < t_.ndim; ++i) {
+    t_.shape[i] = dims[i];
+  }
+
+  // init strides, nullptr means the tensor is compact
+  t_.strides = nullptr;
+
+  // init byte_offset
+  t_.byte_offset = 0;
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/dlpack_tensor.h
+++ b/paddle/fluid/framework/dlpack_tensor.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <dlpack/dlpack.h>
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace framework {
+
+class DLPackTensor {
+ public:
+  using LaneType = decltype(::DLTensor::dtype.lanes);  // uint16_t
+  using ShapeType =
+      std::remove_reference<decltype(::DLTensor::shape[0])>::type;  // int64_t
+
+  // lanes is only used in CPU to enable vectorization
+  explicit DLPackTensor(const Tensor& tensor, LaneType lanes = 1);
+
+  inline operator const ::DLTensor&() const { return t_; }
+
+  inline operator ::DLTensor&() { return t_; }
+
+ private:
+  ::DLTensor t_;
+
+  // The shape in DLTensor is defined as int64_t*
+  // Add this member to make TVMTensor init without heap allocation
+  ShapeType shape_[9];
+};
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/dlpack_tensor_test.cc
+++ b/paddle/fluid/framework/dlpack_tensor_test.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/dlpack_tensor.h"
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <vector>
+
+namespace paddle {
+namespace framework {
+
+namespace {  // NOLINT
+template <typename T>
+constexpr uint8_t GetDLDataTypeCode() {
+  return std::is_same<platform::float16, T>::value ||
+                 std::is_floating_point<T>::value
+             ? static_cast<uint8_t>(kDLFloat)
+             : (std::is_unsigned<T>::value
+                    ? static_cast<uint8_t>(kDLUInt)
+                    : (std::is_integral<T>::value ? static_cast<uint8_t>(kDLInt)
+                                                  : static_cast<uint8_t>(-1)));
+}
+}  // NOLINT
+
+template <typename T>
+void TestMain(const platform::Place &place, uint16_t lanes) {
+  DDim dims{4, 5, 6, 7};
+  Tensor tensor;
+  tensor.Resize(dims);
+  void *p = tensor.mutable_data<T>(place);
+
+  DLPackTensor dlpack_tensor(tensor, lanes);
+  ::DLTensor &dl_tensor = dlpack_tensor;
+
+  CHECK_EQ(p, dl_tensor.data);
+  if (platform::is_cpu_place(place)) {
+    CHECK_EQ(kDLCPU, dl_tensor.ctx.device_type);
+    CHECK_EQ(0, dl_tensor.ctx.device_id);
+  } else if (platform::is_gpu_place(place)) {
+    CHECK_EQ(kDLGPU, dl_tensor.ctx.device_type);
+    CHECK_EQ(boost::get<platform::CUDAPlace>(place).device,
+             dl_tensor.ctx.device_id);
+  } else if (platform::is_cuda_pinned_place(place)) {
+    CHECK_EQ(kDLCPUPinned, dl_tensor.ctx.device_type);
+    CHECK_EQ(0, dl_tensor.ctx.device_id);
+  } else {
+    CHECK_EQ(false, true);
+  }
+
+  CHECK_EQ(dims.size(), dl_tensor.ndim);
+  for (auto i = 0; i < dims.size(); ++i) {
+    CHECK_EQ(dims[i], dl_tensor.shape[i]);
+  }
+
+  CHECK_EQ(dl_tensor.strides == nullptr, true);
+  CHECK_EQ(static_cast<uint64_t>(0), dl_tensor.byte_offset);
+
+  CHECK_EQ(lanes, dl_tensor.dtype.lanes);
+  CHECK_EQ(sizeof(T) * 8, dl_tensor.dtype.bits);
+
+  CHECK_EQ(GetDLDataTypeCode<T>(), dl_tensor.dtype.code);
+}
+
+template <typename T>
+void TestMainLoop() {
+#ifdef PADDLE_WITH_CUDA
+  std::vector<platform::Place> places{platform::CPUPlace(),
+                                      platform::CUDAPlace(0),
+                                      platform::CUDAPinnedPlace()};
+  if (platform::GetCUDADeviceCount() > 1) {
+    places.emplace_back(platform::CUDAPlace(1));
+  }
+#else
+  std::vector<platform::Place> places{platform::CPUPlace()};
+#endif
+  std::vector<uint16_t> lanes{1, 2};
+  for (auto &p : places) {
+    for (auto &l : lanes) {
+      TestMain<T>(p, l);
+    }
+  }
+}
+
+#define PADDLE_DLPACK_TEST(type) \
+  TEST(dlpack, test_##type) { TestMainLoop<type>(); }
+
+using float16 = platform::float16;
+PADDLE_DLPACK_TEST(float16);
+PADDLE_DLPACK_TEST(float);
+PADDLE_DLPACK_TEST(double);
+PADDLE_DLPACK_TEST(int);
+PADDLE_DLPACK_TEST(int64_t);
+PADDLE_DLPACK_TEST(bool);
+PADDLE_DLPACK_TEST(size_t);
+PADDLE_DLPACK_TEST(int16_t);
+PADDLE_DLPACK_TEST(uint8_t);
+PADDLE_DLPACK_TEST(int8_t);
+
+#undef PADDLE_DLPACK_TEST
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ngraph_operator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/operators/detail/macros.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -391,8 +392,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,

  int64_t max_memory_size = GetEagerDeletionThreshold();
  std::unique_ptr<GarbageCollector<Tensor>> gc;
-  // WhileOp would set keep_kids to false
-  // WhileGradOp would need the scopes created in WhileOp
+  // WhileOp would set keep_kids to true,
+  // because WhileGradOp needs the scopes created in WhileOp.
  // Perhaps, we should not perform eager deletion in WhileOp
  // The scopes and variables created by WhileOp would be deleted
  // in WhileGradOp.

--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -83,6 +83,7 @@ void NaiveExecutor::Run() {
  for (auto &op : ops_) {
    VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
            << " on scope " << scope_;
+    op->SetIsCalledByExecutor(false);
    op->Run(*scope_, place_);
  }
 }

--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -252,6 +252,12 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
        this->attrs_[name] = std::vector<int>();
        break;
      }
+      case proto::AttrType::LONGS: {
+        VLOG(110) << "SetAttr: " << Type() << ", " << name
+                  << " from LONGS to LONGS";
+        this->attrs_[name] = std::vector<int64_t>();
+        break;
+      }
      case proto::AttrType::FLOATS: {
        VLOG(110) << "SetAttr: " << Type() << ", " << name
                  << " from INTS to FLOATS";

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/profiler.h"

@@ -33,11 +34,6 @@ DEFINE_bool(check_nan_inf, false,
 namespace paddle {
 namespace framework {

-// Combine two hash values to a single hash.
-inline size_t CombineHash(size_t seed, size_t a) {
-  return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-}
-
 std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
    std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
    std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
@@ -794,17 +790,6 @@ void OperatorWithKernel::TransferInplaceVarsBack(
 Scope* OperatorWithKernel::TryTransferData(
    const Scope& scope, const OpKernelType& expected_kernel_key,
    std::vector<std::string>* transfered_inplace_vars) const {
-// In the inference scenerio, the scopes will be reused across the batches, so
-// the `new_scope` here will result in GPU memroy explosion over the running of
-// operators.
-// We use a thread_local cache to fix that issue, the key in the cache is the
-// combination of the `scope` argument, from_kernel_type, target_kernel_type.
-// Have a discussion with @Superjomn or the inference developers if some changes
-// on this logic for this macro might not tested on the other scenerios.
-#ifdef PADDLE_ON_INFERENCE
-  thread_local std::unordered_map<size_t, Scope*> infer_transfer_scope_cache;
-#endif
-
  Scope* new_scope = nullptr;
  for (auto& var_name_item : Inputs()) {
    for (auto& var_name : var_name_item.second) {
@@ -835,23 +820,23 @@ Scope* OperatorWithKernel::TryTransferData(
      VLOG(30) << "Transform Variable " << var_name << " from "
               << kernel_type_for_var << " to " << expected_kernel_key;

-#ifdef PADDLE_ON_INFERENCE
-      size_t infer_cache_key =
-          CombineHash(OpKernelType::Hash()(kernel_type_for_var),
-                      OpKernelType::Hash()(expected_kernel_key));
-      infer_cache_key =
-          CombineHash(infer_cache_key, std::hash<const Scope*>()(&scope));
-
-      auto it = infer_transfer_scope_cache.find(infer_cache_key);
-      if (it != infer_transfer_scope_cache.end()) {
-        new_scope = infer_transfer_scope_cache[infer_cache_key];
-      } else {
-        new_scope = &scope.NewScope();
-        infer_transfer_scope_cache[infer_cache_key] = new_scope;
+      // In the inference scenerio, the scopes will be reused across the
+      // batches, so the `new_scope` here will result in GPU memroy explosion
+      // over the  running of operators.
+      // We use a thread_local cache to fix that issue, the key in the cache is
+      // the combination of the `scope` argument, from_kernel_type,
+      // target_kernel_type.
+      // Have a discussion with @Superjomn or the inference developers if some
+      // changes on this logic for this macro might not tested on the other
+      // scenerios.
+      // If this op is not called by an Executor or ParallelExecutor, it should
+      // called by a NaiveExecutor, the NaiveExecutor will cache the scopes and
+      // variables, that behavior a lot different.
+      if (!run_by_executor_) {
+        new_scope = TryCreateTransferScope(kernel_type_for_var,
+                                           expected_kernel_key, &scope);
      }
-#endif
-
-      if (new_scope == nullptr) {
+      if (!new_scope) {
        new_scope = &scope.NewScope();
      }


--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -127,6 +127,8 @@ class OperatorBase {
  //! Get all outputs variable names
  virtual std::vector<std::string> OutputVars(bool has_intermediate) const;

+  void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
+
 protected:
  std::string type_;
  // NOTE: in case of OpGrad, inputs_ contains:
@@ -139,6 +141,8 @@ class OperatorBase {
  // IG (Inputs Gradients)
  VariableNameMap outputs_;
  AttributeMap attrs_;
+  // Whether this operator executes in an Executor.
+  bool run_by_executor_{true};

 private:
  void GenerateTemporaryNames();

--- a/paddle/fluid/framework/transfer_scope_cache.cc
+++ b/paddle/fluid/framework/transfer_scope_cache.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/transfer_scope_cache.h"
+
+namespace paddle {
+namespace framework {
+
+std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
+  thread_local auto* x = new std::unordered_map<size_t, Scope*>;
+  return *x;
+}
+
+std::unordered_set<Scope*>& global_transfer_scope_cache() {
+  thread_local auto* x = new std::unordered_set<Scope*>;
+  return *x;
+}
+
+Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
+                              const Scope* scope) {
+  Scope* new_scope{nullptr};
+  size_t infer_cache_key =
+      CombineHash(OpKernelType::Hash()(type0), OpKernelType::Hash()(type1));
+  infer_cache_key =
+      CombineHash(infer_cache_key, std::hash<const Scope*>()(scope));
+
+  auto it = global_transfer_data_cache().find(infer_cache_key);
+  if (it != global_transfer_data_cache().end()) {
+    new_scope = global_transfer_data_cache()[infer_cache_key];
+  } else {
+    new_scope = &scope->NewScope();
+    global_transfer_data_cache()[infer_cache_key] = new_scope;
+  }
+  global_transfer_scope_cache().insert(new_scope);
+  return new_scope;
+}
+
+void RemoveKidsFromTransferScopeCache(Scope* scope) {
+  auto it = global_transfer_scope_cache().find(scope);
+  if (it != global_transfer_scope_cache().end()) {
+    global_transfer_scope_cache().erase(it);
+  }
+  for (auto* s : scope->kids()) {
+    auto it = global_transfer_scope_cache().find(s);
+    if (it != global_transfer_scope_cache().end()) {
+      global_transfer_scope_cache().erase(it);
+    }
+  }
+
+  // remove global transfer data cache
+  auto& cache = global_transfer_data_cache();
+  for (auto it = cache.begin(); it != cache.end();) {
+    if (it->second == scope)
+      it = cache.erase(it);
+    else
+      it++;
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/transfer_scope_cache.h
+++ b/paddle/fluid/framework/transfer_scope_cache.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <thread>  // NOLINT
+#include <unordered_map>
+#include <unordered_set>
+#include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+
+std::unordered_map<size_t, Scope*>& global_transfer_data_cache();
+
+std::unordered_set<Scope*>& global_transfer_scope_cache();
+
+// Combine two hash values to a single hash.
+static size_t CombineHash(size_t seed, size_t a) {
+  return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+
+Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
+                              const Scope* scope);
+
+void RemoveKidsFromTransferScopeCache(Scope* scope);
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -4,6 +4,7 @@ endif()
 # analysis and tensorrt must be added before creating static library,
 # otherwise, there would be undefined reference to them in static library.
 add_subdirectory(analysis)
+add_subdirectory(utils)
 if (TENSORRT_FOUND)
  add_subdirectory(tensorrt)
 endif()

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -30,7 +30,9 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
 cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce)
 cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc)
-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder DEPS zero_copy_tensor)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
+           lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
+           analysis_config paddle_pass_builder zero_copy_tensor reset_tensor_array)

 cc_test(test_paddle_inference_api
        SRCS api_tester.cc

--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -46,6 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
  prog_file = other.prog_file;
  param_file = other.param_file;
  specify_input_name = other.specify_input_name;
+  cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
  // fields from this.
  enable_ir_optim = other.enable_ir_optim;
  use_feed_fetch_ops = other.use_feed_fetch_ops;
@@ -72,6 +73,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
  prog_file = other.prog_file;
  param_file = other.param_file;
  specify_input_name = other.specify_input_name;
+  cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
  // fields from this.
  enable_ir_optim = other.enable_ir_optim;
  use_feed_fetch_ops = other.use_feed_fetch_ops;

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -31,11 +31,11 @@
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #endif
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"

 DECLARE_bool(profile);
-DECLARE_int32(paddle_num_threads);

 namespace paddle {

@@ -65,7 +65,7 @@ bool AnalysisPredictor::Init(
  }

  // no matter with or without MKLDNN
-  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
+  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());

  if (!PrepareScope(parent_scope)) {
    return false;
@@ -158,6 +158,14 @@ bool AnalysisPredictor::PrepareExecutor() {
  return true;
 }

+void AnalysisPredictor::SetMkldnnThreadID(int tid) {
+#ifdef PADDLE_WITH_MKLDNN
+  platform::set_cur_thread_id(tid);
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
+#endif
+}
+
 bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                            std::vector<PaddleTensor> *output_data,
                            int batch_size) {
@@ -165,7 +173,6 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
  inference::Timer timer;
  timer.tic();
  // set feed variable
-  std::vector<framework::LoDTensor> feeds;
  framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
  if (!SetFeed(inputs, scope)) {
    LOG(ERROR) << "fail to set feed";
@@ -206,17 +213,29 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
    framework::DDim ddim = framework::make_ddim(inputs[i].shape);
    void *input_ptr;
    if (inputs[i].dtype == PaddleDType::INT64) {
-      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<int64_t>(ddim, place_);
    } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
-      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<float>(ddim, place_);
    } else {
      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
      return false;
    }

-    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
-    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
-                inputs[i].data.length());
+    if (platform::is_cpu_place(place_)) {
+      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
+                  inputs[i].data.length());
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
+      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
+                   platform::CPUPlace(), inputs[i].data.data(),
+                   inputs[i].data.length(),
+                   0);  // stream 0 for sync copy
+#else
+      PADDLE_THROW("Not compile with CUDA, should not reach here.");
+#endif
+    }
    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
    framework::LoD lod;
    for (auto &level : inputs[i].lod) {

--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -69,6 +69,8 @@ class AnalysisPredictor : public PaddlePredictor {
  framework::Scope *scope() { return scope_.get(); }
  framework::ProgramDesc &program() { return *inference_program_; }

+  void SetMkldnnThreadID(int tid);
+
 protected:
  bool PrepareProgram(const std::shared_ptr<framework::ProgramDesc> &program);
  bool PrepareScope(const std::shared_ptr<framework::Scope> &parent_scope);

--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -24,11 +24,11 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"

 DEFINE_bool(profile, false, "Turn on profiler for fluid");
-DECLARE_int32(paddle_num_threads);

 namespace paddle {
 namespace {
@@ -74,7 +74,7 @@ bool NativePaddlePredictor::Init(
  }

  // no matter with or without MKLDNN
-  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
+  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());

  if (config_.use_gpu) {
    place_ = paddle::platform::CUDAPlace(config_.device);
@@ -135,7 +135,6 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
  Timer timer;
  timer.tic();
  // set feed variable
-  std::vector<framework::LoDTensor> feeds;
  framework::Scope *scope = sub_scope_ != nullptr ? sub_scope_ : scope_.get();
  if (!SetFeed(inputs, scope)) {
    LOG(ERROR) << "fail to set feed";
@@ -191,17 +190,30 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
    framework::DDim ddim = framework::make_ddim(inputs[i].shape);
    void *input_ptr;
    if (inputs[i].dtype == PaddleDType::INT64) {
-      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<int64_t>(ddim, place_);
    } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
-      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<float>(ddim, place_);
    } else {
      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
      return false;
    }

-    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
-    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
-                inputs[i].data.length());
+    if (platform::is_cpu_place(place_)) {
+      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
+                  inputs[i].data.length());
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
+      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
+                   platform::CPUPlace(), inputs[i].data.data(),
+                   inputs[i].data.length(),
+                   0);  // stream 0 for sync copy
+#else
+      PADDLE_THROW("Not compile with CUDA, should not reach here.");
+#endif
+    }
+
    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
    framework::LoD lod;
    for (auto &level : inputs[i].lod) {

--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -46,8 +46,6 @@ if(WITH_GPU)
    endif()
  endif(NOT WIN32)
 endif()
-
-include_directories("D:/Paddle/")
 include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")

--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -51,9 +51,9 @@ struct AnalysisConfig : public NativeConfig {
                            int max_batch_size = 1);
  bool use_tensorrt() const { return use_tensorrt_; }

+  void EnableMKLDNN();
  // NOTE this is just for internal development, please not use it.
  // NOT stable yet.
-  void EnableMKLDNN();
  bool use_mkldnn() const { return use_mkldnn_; }

  friend class ::paddle::AnalysisPredictor;

--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -186,6 +186,19 @@ struct NativeConfig : public PaddlePredictor::Config {
  // Specify the variable's name of each input if input tensors don't follow the
  // `feeds` and `fetches` of the phase `save_inference_model`.
  bool specify_input_name{false};
+
+  // Set and get the number of cpu math library threads.
+  void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) {
+    cpu_math_library_num_threads_ = cpu_math_library_num_threads;
+  }
+  int cpu_math_library_num_threads() const {
+    return cpu_math_library_num_threads_;
+  }
+
+ protected:
+  // number of cpu math library (such as MKL, OpenBlas) threads for each
+  // instance.
+  int cpu_math_library_num_threads_{1};
 };

 // A factory to help create different predictors.

--- a/paddle/fluid/inference/tensorrt/convert/split_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc
@@ -19,9 +19,6 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {

-/*
- * SplitOp.
- */
 class SplitOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
@@ -40,16 +37,11 @@ class SplitOpConverter : public OpConverter {
    int axis = boost::get<int>(op_desc.GetAttr("axis"));
    std::vector<int> output_lengths =
        boost::get<std::vector<int>>(op_desc.GetAttr("sections"));
+    // split on batch is not supported in TensorRT
    PADDLE_ENFORCE(axis != 0);
-    if (axis < 0) {
-      axis += input_dims.nbDims;
-    } else {
-      axis -= 1;
-    }
+    axis += (axis < 0) ? input_dims.nbDims : -1;

    PADDLE_ENFORCE(output_lengths.size() == output_num);
-
-    //
    plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths);
    nvinfer1::IPluginLayer* layer =
        engine_->AddPlugin(&input, input_num, plugin);

--- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
@@ -20,30 +20,92 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {

-TEST(split_op, test) {
+template <int BatchSize, int Axis>
+void TensorRTSplitTest(const std::vector<int> &in_shape,
+                       const std::vector<int> &sections) {
  std::unordered_set<std::string> parameters({""});
  framework::Scope scope;
-  TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2));
-  validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2));
-  validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2));
+  TRTConvertValidation validator(BatchSize + 1, parameters, scope, 10000);
+
+  auto make_dim = [](const std::vector<int> &shape) {
+    nvinfer1::DimsCHW dim;
+    dim.c() = shape[0];
+    dim.h() = shape[1];
+    dim.w() = shape[2];
+    return dim;
+  };
+  validator.DeclInputVar("split_input", make_dim(in_shape));
+  std::vector<std::string> output_vars;
+  for (size_t i = 0; i < sections.size(); ++i) {
+    auto out_shape = in_shape;
+    out_shape[Axis - 1] = sections[i];
+    std::string output_name = "split_out" + std::to_string(i);
+    validator.DeclOutputVar(output_name, make_dim(out_shape));
+    output_vars.push_back(output_name);
+  }

  // Prepare Op description
  framework::OpDesc desc;
  desc.SetType("split");
  desc.SetInput("X", {"split_input"});
-  desc.SetOutput("Out", {"split_out1", "split_out2"});
+  desc.SetOutput("Out", output_vars);

-  int num = 0;
-  int axis = 1;
-  std::vector<int> output_lengths = {2, 1};
-  desc.SetAttr("axis", axis);
-  desc.SetAttr("num", num);
-  desc.SetAttr("sections", output_lengths);
+  desc.SetAttr("axis", Axis);
+  desc.SetAttr("num", 0);
+  desc.SetAttr("sections", sections);

  validator.SetOp(*desc.Proto());

-  validator.Execute(1);
+  validator.Execute(BatchSize);
+}
+
+// batch = 0, axis = 1, same shape
+TEST(split_op, test_same_shape_axis1_batch1) {
+  TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2});
+}
+// batch = 0, axis = 1, different shape
+TEST(split_op, test_different_shape_axis1_batch1) {
+  TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1});
+}
+// batch = 10, axis = 1, same shape
+TEST(split_op, test_same_shape_axis1_batch10) {
+  TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2});
+}
+// batch = 10, axis = 1, different shape
+TEST(split_op, test_different_shape_axis1_batch10) {
+  TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1});
+}
+// batch = 0, axis = 2, same shape
+TEST(split_op, test_same_shape_axis2_batch1) {
+  TensorRTSplitTest<1, 2>({3, 4, 2}, {2, 2});
+}
+// batch = 0, axis = 2, different shape
+TEST(split_op, test_different_shape_axis2_batch1) {
+  TensorRTSplitTest<1, 2>({3, 3, 2}, {2, 1});
+}
+// batch = 10, axis = 2, same shape
+TEST(split_op, test_same_shape_axis2_batch10) {
+  TensorRTSplitTest<10, 2>({3, 4, 2}, {2, 2});
+}
+// batch = 10, axis = 2, different shape
+TEST(split_op, test_different_shape_axis2_batch10) {
+  TensorRTSplitTest<10, 2>({3, 3, 2}, {2, 1});
+}
+// batch = 0, axis = 3, same shape
+TEST(split_op, test_same_shape_axis3_batch1) {
+  TensorRTSplitTest<1, 3>({3, 2, 4}, {2, 2});
+}
+// batch = 0, axis = 3, different shape
+TEST(split_op, test_different_shape_axis3_batch1) {
+  TensorRTSplitTest<1, 3>({3, 2, 3}, {2, 1});
+}
+// batch = 10, axis = 3, same shape
+TEST(split_op, test_same_shape_axis3_batch10) {
+  TensorRTSplitTest<10, 3>({3, 2, 4}, {2, 2});
+}
+// batch = 10, axis = 3, different shape
+TEST(split_op, test_different_shape_axis3_batch10) {
+  TensorRTSplitTest<10, 3>({3, 2, 3}, {2, 1});
 }

 }  // namespace tensorrt

--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <cuda_fp16.h>
+#include <algorithm>
 #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"

 namespace paddle {
@@ -19,6 +21,52 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {

+// copied from operators::math::SplitFunctor
+template <typename T>
+__global__ void SplitKernel(const T* input_data, const int in_row,
+                            const int in_col, const int* out_cols,
+                            int out_cols_size, T** outputs_data) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  int curr_segment = 0;
+  int curr_offset = out_cols[0];
+  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
+    int curr_col_offset = out_cols[curr_segment + 1];
+    while (curr_col_offset <= tid_x) {
+      curr_offset = curr_col_offset;
+      ++curr_segment;
+      curr_col_offset = out_cols[curr_segment + 1];
+    }
+
+    int local_col = tid_x - curr_offset;
+    int segment_width = curr_col_offset - curr_offset;
+    T* output_ptr = outputs_data[curr_segment];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * segment_width + local_col] =
+            input_data[tid_y * in_col + tid_x];
+    }
+  }
+}
+
+template <typename T>
+__global__ void SplitKernel(const T* input_data, const int in_row,
+                            const int in_col, const int fixed_out_col,
+                            T** outputs_data) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
+    int split = tid_x / fixed_out_col;
+    int in_offset = tid_x - split * fixed_out_col;
+    T* output_ptr = outputs_data[split];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * fixed_out_col + in_offset] =
+            input_data[tid_y * in_col + tid_x];
+    }
+  }
+}
+
 nvinfer1::Dims SplitPlugin::getOutputDimensions(
    int index, const nvinfer1::Dims* input_dims, int num_inputs) {
  PADDLE_ENFORCE_EQ(num_inputs, 1);
@@ -31,48 +79,96 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions(

 int SplitPlugin::initialize() {
  PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS);
-
+  // notice input dims is [C, H, W]
+  nvinfer1::Dims dims = this->getInputDims(0);
+  outer_rows_ = 1;
+  inner_cols_ = 1;
+  for (int i = 0; i < axis_; ++i) {
+    outer_rows_ *= dims.d[i];
+  }
+  for (int i = axis_ + 1; i < dims.nbDims; ++i) {
+    inner_cols_ *= dims.d[i];
+  }
+  same_shape_ = true;
  std::vector<int> segment_offsets(1, 0);
  for (int i = 0; i < this->getNbOutputs(); ++i) {
-    segment_offsets.push_back(segment_offsets.back() + output_length_[i]);
+    if (output_length_[i] != output_length_[0]) {
+      same_shape_ = false;
+    }
+    segment_offsets.push_back(segment_offsets.back() +
+                              output_length_[i] * inner_cols_);
  }
-  segment_offsets_ = segment_offsets;
-  nvinfer1::Dims dims = this->getInputDims(0);
-  nx_ = 1;
-  for (int i = dims.nbDims - 1; i > axis_; --i) {
-    nx_ *= dims.d[i];
+  inner_cols_ *= dims.d[axis_];
+  d_segment_offsets_ = segment_offsets;
+  segment_offsets_ = std::move(segment_offsets);
+  d_output_ptrs_.resize(this->getNbOutputs(), nullptr);
+  return 0;
+}
+
+template <typename T>
+inline void Split(cudaStream_t stream, const bool same_shape,
+                  const int outer_rows, const int inner_cols,
+                  const std::vector<int>& segment_offsets,
+                  const int* d_segment_offsets, const T* input, T** outputs) {
+  const int kThreadsPerBlock = 1024;
+  const int kMaxBlocks = 65535;
+  int block_cols = kThreadsPerBlock;
+  if (inner_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
+    block_cols = ((inner_cols + 31) >> 5) << 5;
  }
-  ny_ = dims.d[axis_];
-  nz_ = 1;
-  for (int i = axis_ - 1; i >= 0; --i) {
-    nz_ *= dims.d[i];
+  int block_rows = kThreadsPerBlock / block_cols;
+  dim3 block_size = dim3(block_cols, block_rows, 1);
+
+  int grid_cols =
+      std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks);
+  int grid_rows =
+      std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1));
+  dim3 grid_size = dim3(grid_cols, grid_rows, 1);
+
+  if (same_shape) {
+    SplitKernel<<<grid_size, block_size, 0, stream>>>(
+        input, outer_rows, inner_cols, segment_offsets[1], outputs);
+  } else {
+    SplitKernel<<<grid_size, block_size, 0, stream>>>(
+        input, outer_rows, inner_cols, d_segment_offsets,
+        static_cast<int>(segment_offsets.size()), outputs);
  }
-  return 0;
 }

 int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
                         void** outputs, void* workspace, cudaStream_t stream) {
-  auto const& input_dims = this->getInputDims(0);
-  int input_size = 0;
-  float const* idata = reinterpret_cast<float const*>(inputs[0]);
-  float** odatas = reinterpret_cast<float**>(outputs);
-
-  // kernel impl here.
-  int inputBatchOffset = nx_ * ny_ * nz_;
-  for (size_t i = 0; i < this->getNbOutputs(); i++) {
-    for (size_t j = 0; j < batchSize; j++) {
-      cudaMemcpyAsync(
-          odatas[i] +
-              j * (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ *
-                  sizeof(float),
-          inputs[0] +
-              (inputBatchOffset * j + segment_offsets_[i] * nx_) *
-                  sizeof(float),
-          (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * sizeof(float),
-          cudaMemcpyDeviceToDevice, stream);
+  float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
+  if (((batchSize == 1 && axis_ == 0) || axis_ == -1) &&
+      this->getNbOutputs() < 10) {
+    float** output_ptrs = reinterpret_cast<float**>(outputs);
+    int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT)
+                             ? sizeof(float)
+                             : sizeof(__half);
+    for (int i = 0; i < this->getNbOutputs(); ++i) {
+      PADDLE_ENFORCE(
+          cudaMemcpyAsync(
+              output_ptrs[i], input_ptr + segment_offsets_[i],
+              (segment_offsets_[i + 1] - segment_offsets_[i]) * data_type_size,
+              cudaMemcpyDeviceToDevice, stream) == cudaSuccess);
+    }
+  } else {
+    outer_rows_ *= batchSize;
+    const int* d_segment_offsets_ptr =
+        thrust::raw_pointer_cast(&d_segment_offsets_[0]);
+    float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
+    PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, outputs,
+                                   this->getNbOutputs() * sizeof(float*),
+                                   cudaMemcpyHostToDevice,
+                                   stream) == cudaSuccess);
+    if (this->getDataType() == nvinfer1::DataType::kFLOAT) {
+      Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_,
+            d_segment_offsets_ptr, input_ptr, output_ptrs);
+    } else {
+      Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_,
+            d_segment_offsets_ptr, (__half*)input_ptr,  // NOLINT
+            (__half**)output_ptrs);                     // NOLINT
    }
  }
-
  return cudaGetLastError() != cudaSuccess;
 }


--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
@@ -14,6 +14,7 @@

 #pragma once

+#include <thrust/device_vector.h>
 #include <vector>
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"

@@ -25,7 +26,7 @@ namespace plugin {
 class SplitPlugin : public PluginTensorRT {
 public:
  SplitPlugin(int axis, std::vector<int> const &output_lengths)
-      : axis_(axis), output_length_(output_lengths) {}
+      : axis_(axis), same_shape_(true), output_length_(output_lengths) {}

  SplitPlugin(void const *serial_data, size_t serial_length) {
    deserializeBase(serial_data, serial_length);
@@ -60,9 +61,13 @@ class SplitPlugin : public PluginTensorRT {
  }

  int axis_;
+  int outer_rows_;
+  int inner_cols_;
+  bool same_shape_;
  std::vector<int> output_length_;
-  int nx_, ny_, nz_;
  std::vector<int> segment_offsets_;
+  thrust::device_vector<int> d_segment_offsets_;
+  thrust::device_vector<float *> d_output_ptrs_;
 };

 }  // namespace plugin

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -74,7 +74,7 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR})
-  inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
+    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
 endif()
 inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)

@@ -88,31 +88,31 @@ inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet

 # anakin
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
-   # anakin rnn1
-   set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
-   set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
-   inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
-   inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
-   cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
-           ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
-                --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
-           DEPS inference_anakin_api_shared SERIAL)
-   # anakin mobilenet
-   if(WITH_GPU)
-       set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
-       inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
-       cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
-               ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
-               DEPS inference_anakin_api_shared dynload_cuda SERIAL)
-   endif()
+    # anakin rnn1
+    set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
+    set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
+    inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
+    inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
+    cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
+            ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
+                 --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
+            DEPS inference_anakin_api_shared SERIAL)
+    # anakin mobilenet
+    if(WITH_GPU)
+        set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
+        inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
+        cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
+                ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
+                DEPS inference_anakin_api_shared dynload_cuda SERIAL)
+    endif()
 endif()

 if(WITH_GPU AND TENSORRT_FOUND)
-   set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
-   if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
-       inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
-   endif()
-   inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
-      EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
+    set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
+    if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
+        inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
+    endif()
+    inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
+            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
 endif()
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -27,6 +27,7 @@ void SetConfig(AnalysisConfig *cfg) {
  cfg->device = 0;
  cfg->enable_ir_optim = true;
  cfg->specify_input_name = true;
+  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {

--- a/paddle/fluid/inference/tests/api/config_printer.h
+++ b/paddle/fluid/inference/tests/api/config_printer.h
@@ -53,6 +53,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
  os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
  os << GenSpaces(num_spaces)
     << "specify_input_name: " << config.specify_input_name << "\n";
+  os << GenSpaces(num_spaces)
+     << "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n";
  num_spaces--;
  os << GenSpaces(num_spaces) << "}\n";
  return os;

--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -42,6 +42,7 @@ DEFINE_bool(use_analysis, true,
            "Running the inference program in analysis mode.");

 DECLARE_bool(profile);
+DECLARE_int32(paddle_num_threads);

 namespace paddle {
 namespace inference {
@@ -204,22 +205,23 @@ void TestMultiThreadPrediction(
  int batch_size = FLAGS_batch_size;
  int num_times = FLAGS_repeat;
  std::vector<std::thread> threads;
-  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-  predictors.emplace_back(CreateTestPredictor(config, use_analysis));
-  for (int tid = 1; tid < num_threads; ++tid) {
-    predictors.emplace_back(predictors.front()->Clone());
-  }
+  auto main_predictor = CreateTestPredictor(config, use_analysis);

  size_t total_time{0};
  for (int tid = 0; tid < num_threads; ++tid) {
    threads.emplace_back([&, tid]() {
-#ifdef PADDLE_WITH_MKLDNN
-      platform::set_cur_thread_id(static_cast<int>(tid) + 1);
-#endif
      // Each thread should have local inputs and outputs.
      // The inputs of each thread are all the same.
      std::vector<PaddleTensor> outputs_tid;
-      auto &predictor = predictors[tid];
+      // To ensure the thread binding correctly,
+      // please clone inside the threadpool.
+      auto predictor = main_predictor->Clone();
+#ifdef PADDLE_WITH_MKLDNN
+      if (use_analysis) {
+        static_cast<AnalysisPredictor *>(predictor.get())
+            ->SetMkldnnThreadID(static_cast<int>(tid) + 1);
+      }
+#endif

      // warmup run
      LOG(INFO) << "Running thread " << tid << ", warm up run...";

--- a/paddle/fluid/inference/utils/CMakeLists.txt
+++ b/paddle/fluid/inference/utils/CMakeLists.txt
+cc_library(benchmark SRCS benchmark.cc DEPS enforce)
+cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
--- a/paddle/fluid/inference/utils/benchmark.cc
+++ b/paddle/fluid/inference/utils/benchmark.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/utils/benchmark.h"
+#include <sstream>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+
+std::string Benchmark::SerializeToString() const {
+  std::stringstream ss;
+  ss << "-----------------------------------------------------\n";
+  ss << "name\t";
+  ss << "batch_size\t";
+  ss << "num_threads\t";
+  ss << "latency\t";
+  ss << "qps";
+  ss << '\n';
+
+  ss << name_ << "\t";
+  ss << batch_size_ << "\t";
+  ss << num_threads_ << "\t";
+  ss << latency_ << "\t";
+  ss << 1000 / latency_;
+  ss << '\n';
+  return ss.str();
+}
+void Benchmark::PersistToFile(const std::string &path) const {
+  std::ofstream file(path, std::ios::app);
+  PADDLE_ENFORCE(file.is_open(), "Can not open %s to add benchmark", path);
+  file << SerializeToString();
+  file.flush();
+  file.close();
+}
+
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/utils/benchmark.h
+++ b/paddle/fluid/inference/utils/benchmark.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fstream>
+#include <iostream>
+
+namespace paddle {
+namespace inference {
+
+/*
+ * Helper class to calculate the performance.
+ */
+struct Benchmark {
+  int batch_size() const { return batch_size_; }
+  void SetBatchSize(int x) { batch_size_ = x; }
+
+  int num_threads() const { return num_threads_; }
+  void SetNumThreads(int x) { num_threads_ = x; }
+
+  bool use_gpu() const { return use_gpu_; }
+  void SetUseGpu() { use_gpu_ = true; }
+
+  int latency() const { return latency_; }
+  void SetLatency(int x) { latency_ = x; }
+
+  const std::string& name() const { return name_; }
+  void SetName(const std::string& name) { name_ = name; }
+
+  std::string SerializeToString() const;
+  void PersistToFile(const std::string& path) const;
+
+ private:
+  bool use_gpu_{false};
+  int batch_size_{0};
+  int latency_;
+  int num_threads_{1};
+  std::string name_;
+};
+
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/utils/benchmark_tester.cc
+++ b/paddle/fluid/inference/utils/benchmark_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/utils/benchmark.h"
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+using namespace paddle::inference;
+TEST(Benchmark, basic) {
+  Benchmark benchmark;
+  benchmark.SetName("key0");
+  benchmark.SetBatchSize(10);
+  benchmark.SetUseGpu();
+  benchmark.SetLatency(220);
+  LOG(INFO) << "benchmark:\n" << benchmark.SerializeToString();
+}
+
+TEST(Benchmark, PersistToFile) {
+  Benchmark benchmark;
+  benchmark.SetName("key0");
+  benchmark.SetBatchSize(10);
+  benchmark.SetUseGpu();
+  benchmark.SetLatency(220);
+
+  benchmark.PersistToFile("1.log");
+  benchmark.PersistToFile("1.log");
+  benchmark.PersistToFile("1.log");
+}
\ No newline at end of file
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -22,7 +22,7 @@ iou_similarity_op.cu)
 detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
 detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc)
 detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
-detection_library(density_prior_box_op SRCS density_prior_box_op.cc)
+detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu)
 detection_library(anchor_generator_op SRCS anchor_generator_op.cc
 anchor_generator_op.cu)
 detection_library(target_assign_op SRCS target_assign_op.cc

--- a/paddle/fluid/operators/detection/density_prior_box_op.cc
+++ b/paddle/fluid/operators/detection/density_prior_box_op.cc
@@ -39,24 +39,27 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
    auto fixed_sizes = ctx->Attrs().Get<std::vector<float>>("fixed_sizes");
    auto fixed_ratios = ctx->Attrs().Get<std::vector<float>>("fixed_ratios");
    auto densities = ctx->Attrs().Get<std::vector<int>>("densities");
+    bool flatten = ctx->Attrs().Get<bool>("flatten_to_2d");

    PADDLE_ENFORCE_EQ(fixed_sizes.size(), densities.size(),
                      "The number of fixed_sizes and densities must be equal.");
    size_t num_priors = 0;
-    if ((fixed_sizes.size() > 0) && (densities.size() > 0)) {
-      for (size_t i = 0; i < densities.size(); ++i) {
-        if (fixed_ratios.size() > 0) {
-          num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
-        }
-      }
+    for (size_t i = 0; i < densities.size(); ++i) {
+      num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
+    }
+    if (!flatten) {
+      std::vector<int64_t> dim_vec(4);
+      dim_vec[0] = input_dims[2];
+      dim_vec[1] = input_dims[3];
+      dim_vec[2] = num_priors;
+      dim_vec[3] = 4;
+      ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
+      ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
+    } else {
+      int64_t dim0 = input_dims[2] * input_dims[3] * num_priors;
+      ctx->SetOutputDim("Boxes", {dim0, 4});
+      ctx->SetOutputDim("Variances", {dim0, 4});
    }
-    std::vector<int64_t> dim_vec(4);
-    dim_vec[0] = input_dims[2];
-    dim_vec[1] = input_dims[3];
-    dim_vec[2] = num_priors;
-    dim_vec[3] = 4;
-    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
-    ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
  }

 protected:
@@ -64,7 +67,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
-        platform::CPUPlace());
+        ctx.GetPlace());
  }
 };

@@ -101,7 +104,10 @@ class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
        });
    AddAttr<bool>("clip", "(bool) Whether to clip out-of-boundary boxes.")
        .SetDefault(true);
-
+    AddAttr<bool>("flatten_to_2d",
+                  "(bool) Whether to flatten to 2D and "
+                  "the second dim is 4.")
+        .SetDefault(false);
    AddAttr<float>(
        "step_w",
        "Density prior boxes step across width, 0.0 for auto calculation.")

--- a/paddle/fluid/operators/detection/density_prior_box_op.cu
+++ b/paddle/fluid/operators/detection/density_prior_box_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/density_prior_box_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+static __device__ inline T Clip(T in) {
+  return min(max(in, 0.), 1.);
+}
+
+template <typename T>
+static __global__ void GenDensityPriorBox(
+    const int height, const int width, const int im_height, const int im_width,
+    const T offset, const T step_width, const T step_height,
+    const int num_priors, const T* ratios_shift, bool is_clip, const T var_xmin,
+    const T var_ymin, const T var_xmax, const T var_ymax, T* out, T* var) {
+  int gidx = blockIdx.x * blockDim.x + threadIdx.x;
+  int gidy = blockIdx.y * blockDim.y + threadIdx.y;
+  int step_x = blockDim.x * gridDim.x;
+  int step_y = blockDim.y * gridDim.y;
+
+  const T* width_ratio = ratios_shift;
+  const T* height_ratio = ratios_shift + num_priors;
+  const T* width_shift = ratios_shift + 2 * num_priors;
+  const T* height_shift = ratios_shift + 3 * num_priors;
+
+  for (int j = gidy; j < height; j += step_y) {
+    for (int i = gidx; i < width * num_priors; i += step_x) {
+      int h = j;
+      int w = i / num_priors;
+      int k = i % num_priors;
+
+      T center_x = (w + offset) * step_width;
+      T center_y = (h + offset) * step_height;
+
+      T center_x_temp = center_x + width_shift[k];
+      T center_y_temp = center_y + height_shift[k];
+
+      T box_width_ratio = width_ratio[k] / 2.;
+      T box_height_ratio = height_ratio[k] / 2.;
+
+      T xmin = max((center_x_temp - box_width_ratio) / im_width, 0.);
+      T ymin = max((center_y_temp - box_height_ratio) / im_height, 0.);
+      T xmax = min((center_x_temp + box_width_ratio) / im_width, 1.);
+      T ymax = min((center_y_temp + box_height_ratio) / im_height, 1.);
+
+      int out_offset = (j * width * num_priors + i) * 4;
+      out[out_offset] = is_clip ? Clip<T>(xmin) : xmin;
+      out[out_offset + 1] = is_clip ? Clip<T>(ymin) : ymin;
+      out[out_offset + 2] = is_clip ? Clip<T>(xmax) : xmax;
+      out[out_offset + 3] = is_clip ? Clip<T>(ymax) : ymax;
+
+      var[out_offset] = var_xmin;
+      var[out_offset + 1] = var_ymin;
+      var[out_offset + 2] = var_xmax;
+      var[out_offset + 3] = var_ymax;
+    }
+  }
+}
+
+template <typename T>
+class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
+    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
+    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
+    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
+
+    auto variances = ctx.Attr<std::vector<float>>("variances");
+    auto is_clip = ctx.Attr<bool>("clip");
+
+    auto fixed_sizes = ctx.Attr<std::vector<float>>("fixed_sizes");
+    auto fixed_ratios = ctx.Attr<std::vector<float>>("fixed_ratios");
+    auto densities = ctx.Attr<std::vector<int>>("densities");
+
+    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
+    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
+    T offset = static_cast<T>(ctx.Attr<float>("offset"));
+
+    auto img_width = image->dims()[3];
+    auto img_height = image->dims()[2];
+
+    auto feature_width = input->dims()[3];
+    auto feature_height = input->dims()[2];
+
+    T step_width, step_height;
+    if (step_w == 0 || step_h == 0) {
+      step_width = static_cast<T>(img_width) / feature_width;
+      step_height = static_cast<T>(img_height) / feature_height;
+    } else {
+      step_width = step_w;
+      step_height = step_h;
+    }
+
+    int num_priors = 0;
+    for (size_t i = 0; i < densities.size(); ++i) {
+      num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
+    }
+    int step_average = static_cast<int>((step_width + step_height) * 0.5);
+
+    framework::Tensor h_temp;
+    T* tdata = h_temp.mutable_data<T>({num_priors * 4}, platform::CPUPlace());
+    int idx = 0;
+    for (size_t s = 0; s < fixed_sizes.size(); ++s) {
+      auto fixed_size = fixed_sizes[s];
+      int density = densities[s];
+      for (size_t r = 0; r < fixed_ratios.size(); ++r) {
+        float ar = fixed_ratios[r];
+        int shift = step_average / density;
+        float box_width_ratio = fixed_size * sqrt(ar);
+        float box_height_ratio = fixed_size / sqrt(ar);
+        for (int di = 0; di < density; ++di) {
+          for (int dj = 0; dj < density; ++dj) {
+            float center_x_temp = shift / 2. + dj * shift - step_average / 2.;
+            float center_y_temp = shift / 2. + di * shift - step_average / 2.;
+            tdata[idx] = box_width_ratio;
+            tdata[num_priors + idx] = box_height_ratio;
+            tdata[2 * num_priors + idx] = center_x_temp;
+            tdata[3 * num_priors + idx] = center_y_temp;
+            idx++;
+          }
+        }
+      }
+    }
+
+    boxes->mutable_data<T>(ctx.GetPlace());
+    vars->mutable_data<T>(ctx.GetPlace());
+
+    framework::Tensor d_temp;
+    framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp);
+
+    // At least use 32 threads, at most 512 threads.
+    // blockx is multiple of 32.
+    int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L);
+    int gridx = (feature_width * num_priors + blockx - 1) / blockx;
+    dim3 threads(blockx, 1);
+    dim3 grids(gridx, feature_height);
+
+    auto stream =
+        ctx.template device_context<platform::CUDADeviceContext>().stream();
+    GenDensityPriorBox<T><<<grids, threads, 0, stream>>>(
+        feature_height, feature_width, img_height, img_width, offset,
+        step_width, step_height, num_priors, d_temp.data<T>(), is_clip,
+        variances[0], variances[1], variances[2], variances[3],
+        boxes->data<T>(), vars->data<T>());
+  }
+};  // namespace operators
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(density_prior_box,
+                        ops::DensityPriorBoxOpCUDAKernel<float>,
+                        ops::DensityPriorBoxOpCUDAKernel<double>);
--- a/paddle/fluid/operators/detection/density_prior_box_op.h
+++ b/paddle/fluid/operators/detection/density_prior_box_op.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -52,18 +52,16 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
      step_height = step_h;
    }
    int num_priors = 0;
-    if (fixed_sizes.size() > 0 && densities.size() > 0) {
-      for (size_t i = 0; i < densities.size(); ++i) {
-        if (fixed_ratios.size() > 0) {
-          num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
-        }
-      }
+    for (size_t i = 0; i < densities.size(); ++i) {
+      num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
    }

    boxes->mutable_data<T>(ctx.GetPlace());
    vars->mutable_data<T>(ctx.GetPlace());
-    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes).setConstant(0.0);

+    auto box_dim = vars->dims();
+    boxes->Resize({feature_height, feature_width, num_priors, 4});
+    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes).setConstant(0.0);
    int step_average = static_cast<int>((step_width + step_height) * 0.5);

    for (int h = 0; h < feature_height; ++h) {
@@ -76,36 +74,34 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
          auto fixed_size = fixed_sizes[s];
          int density = densities[s];
          // Generate density prior boxes with fixed ratios.
-          if (fixed_ratios.size() > 0) {
-            for (size_t r = 0; r < fixed_ratios.size(); ++r) {
-              float ar = fixed_ratios[r];
-              int shift = step_average / density;
-              float box_width_ratio = fixed_size * sqrt(ar);
-              float box_height_ratio = fixed_size / sqrt(ar);
-              for (int di = 0; di < density; ++di) {
-                for (int dj = 0; dj < density; ++dj) {
-                  float center_x_temp =
-                      center_x - step_average / 2. + shift / 2. + dj * shift;
-                  float center_y_temp =
-                      center_y - step_average / 2. + shift / 2. + di * shift;
-                  e_boxes(h, w, idx, 0) =
-                      (center_x_temp - box_width_ratio / 2.) / img_width >= 0
-                          ? (center_x_temp - box_width_ratio / 2.) / img_width
-                          : 0;
-                  e_boxes(h, w, idx, 1) =
-                      (center_y_temp - box_height_ratio / 2.) / img_height >= 0
-                          ? (center_y_temp - box_height_ratio / 2.) / img_height
-                          : 0;
-                  e_boxes(h, w, idx, 2) =
-                      (center_x_temp + box_width_ratio / 2.) / img_width <= 1
-                          ? (center_x_temp + box_width_ratio / 2.) / img_width
-                          : 1;
-                  e_boxes(h, w, idx, 3) =
-                      (center_y_temp + box_height_ratio / 2.) / img_height <= 1
-                          ? (center_y_temp + box_height_ratio / 2.) / img_height
-                          : 1;
-                  idx++;
-                }
+          for (size_t r = 0; r < fixed_ratios.size(); ++r) {
+            float ar = fixed_ratios[r];
+            int shift = step_average / density;
+            float box_width_ratio = fixed_size * sqrt(ar);
+            float box_height_ratio = fixed_size / sqrt(ar);
+            for (int di = 0; di < density; ++di) {
+              for (int dj = 0; dj < density; ++dj) {
+                float center_x_temp =
+                    center_x - step_average / 2. + shift / 2. + dj * shift;
+                float center_y_temp =
+                    center_y - step_average / 2. + shift / 2. + di * shift;
+                e_boxes(h, w, idx, 0) =
+                    (center_x_temp - box_width_ratio / 2.) / img_width >= 0
+                        ? (center_x_temp - box_width_ratio / 2.) / img_width
+                        : 0;
+                e_boxes(h, w, idx, 1) =
+                    (center_y_temp - box_height_ratio / 2.) / img_height >= 0
+                        ? (center_y_temp - box_height_ratio / 2.) / img_height
+                        : 0;
+                e_boxes(h, w, idx, 2) =
+                    (center_x_temp + box_width_ratio / 2.) / img_width <= 1
+                        ? (center_x_temp + box_width_ratio / 2.) / img_width
+                        : 1;
+                e_boxes(h, w, idx, 3) =
+                    (center_y_temp + box_height_ratio / 2.) / img_height <= 1
+                        ? (center_y_temp + box_height_ratio / 2.) / img_height
+                        : 1;
+                idx++;
              }
            }
          }
@@ -139,6 +135,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
    e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(box_num, 1));

    vars->Resize(var_dim);
+    boxes->Resize(box_dim);
  }
 };  // namespace operators


--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -22,6 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.h"

+DECLARE_bool(rpc_disable_reuse_port);
+
 namespace paddle {
 namespace operators {
 namespace distributed {
@@ -383,6 +385,9 @@ std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
  // Channel configurations:
  grpc::ChannelArguments args;
  args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000);
+  if (FLAGS_rpc_disable_reuse_port) {
+    args.SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0);
+  }
  args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
  args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
  args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());

--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -20,6 +20,8 @@ limitations under the License. */

 using ::grpc::ServerAsyncResponseWriter;

+DECLARE_bool(rpc_disable_reuse_port);
+
 namespace paddle {
 namespace operators {
 namespace distributed {
@@ -252,6 +254,20 @@ void AsyncGRPCServer::WaitServerReady() {
  VLOG(40) << "AsyncGRPCServer WaitSeverReady";
 }

+// Define an option subclass in order to disable SO_REUSEPORT for the
+// server socket.
+// Come from:
+// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+class NoReusePortOption : public ::grpc::ServerBuilderOption {
+ public:
+  void UpdateArguments(::grpc::ChannelArguments* args) override {
+    args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0);
+  }
+
+  void UpdatePlugins(std::vector<std::unique_ptr<::grpc::ServerBuilderPlugin>>*
+                         plugins) override {}
+};
+
 void AsyncGRPCServer::StartServer() {
  ::grpc::ServerBuilder builder;
  builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(),
@@ -259,6 +275,10 @@ void AsyncGRPCServer::StartServer() {

  builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
  builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
+  if (FLAGS_rpc_disable_reuse_port) {
+    builder.SetOption(
+        std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption));
+  }
  builder.RegisterService(&service_);

  for (auto t : rpc_call_map_) {

--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -22,6 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/variable_response.h"
 #include "paddle/fluid/platform/port.h"

+DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not.");
+
 namespace paddle {
 namespace operators {
 namespace distributed {

--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
@@ -16,6 +16,9 @@

 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/dynload/cublas.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+DECLARE_bool(enable_cublas_tensor_op_math);

 namespace paddle {
 namespace operators {
@@ -42,11 +45,44 @@ struct CUBlas<float> {
  }

  template <typename... ARGS>
-  static void GEMM_BATCH(ARGS... args) {
+  static void GEMM_STRIDED_BATCH(ARGS... args) {
 #if CUDA_VERSION >= 8000
    PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(args...));
 #else
    PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
+  }
+
+  // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
+  // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
+  template <typename... ARGS>
+  static void GEMM_EX(platform::CUDADeviceContext *dev_ctx,
+                      cublasOperation_t transa, cublasOperation_t transb, int m,
+                      int n, int k, const float *alpha, const void *A,
+                      cudaDataType_t Atype, int lda, const void *B,
+                      cudaDataType_t Btype, int ldb, const float *beta, void *C,
+                      cudaDataType_t Ctype, int ldc) {
+    // Because the gcc 4.8 doesn't expand template parameter pack that
+    // appears in a lambda-expression, I can not use template parameter pack
+    // here.
+    auto cublas_call = [&]() {
+#if CUDA_VERSION >= 8000
+      VLOG(5) << "use_tensor_op_math: "
+              << (platform::TensorCoreAvailable() ? "True" : "False");
+      PADDLE_ENFORCE(platform::dynload::cublasSgemmEx(
+          dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype,
+          lda, B, Btype, ldb, beta, C, Ctype, ldc));
+#else
+      PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0");
+#endif
+    };
+
+#if CUDA_VERSION >= 9000
+    // NOTES: To use Tensor Core, we should change the cublas config,
+    // but the cublas may be hold by multi-thread.
+    dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH);
+#else
+    cublas_call();
 #endif
  }
 };
@@ -69,13 +105,18 @@ struct CUBlas<double> {
  }

  template <typename... ARGS>
-  static void GEMM_BATCH(ARGS... args) {
+  static void GEMM_STRIDED_BATCH(ARGS... args) {
 #if CUDA_VERSION >= 8000
    PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(args...));
 #else
    PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5");
 #endif
  }
+
+  template <typename... ARGS>
+  static void GEMM_EX(ARGS... args) {
+    PADDLE_THROW("Currently there are not cublasDgemmEx.");
+  }
 };

 template <>
@@ -96,14 +137,16 @@ struct CUBlas<platform::float16> {
                                       reinterpret_cast<__half *>(C), ldc));
  }

-  static void GEMM_BATCH(cublasHandle_t handle, cublasOperation_t transa,
-                         cublasOperation_t transb, int m, int n, int k,
-                         const float16 *alpha, const float16 *A, int lda,
-                         long long int strideA, const float16 *B,  // NOLINT
-                         int ldb, long long int strideB,           // NOLINT
-                         const float16 *beta, float16 *C, int ldc,
-                         long long int strideC,  // NOLINT
-                         int batchCount) {
+  static void GEMM_STRIDED_BATCH(cublasHandle_t handle,
+                                 cublasOperation_t transa,
+                                 cublasOperation_t transb, int m, int n, int k,
+                                 const float16 *alpha, const float16 *A,
+                                 int lda, long long int strideA,  // NOLINT
+                                 const float16 *B,                // NOLINT
+                                 int ldb, long long int strideB,  // NOLINT
+                                 const float16 *beta, float16 *C, int ldc,
+                                 long long int strideC,  // NOLINT
+                                 int batchCount) {
 #if CUDA_VERSION >= 8000
    PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
        handle, transa, transb, m, n, k,
@@ -114,6 +157,45 @@ struct CUBlas<platform::float16> {
        ldc, strideC, batchCount));
 #else
    PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
+  }
+
+  // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
+  // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
+  template <typename... ARGS>
+  static void GEMM_EX(platform::CUDADeviceContext *dev_ctx,
+                      cublasOperation_t transa, cublasOperation_t transb, int m,
+                      int n, int k, const void *alpha, const void *A,
+                      cudaDataType_t Atype, int lda, const void *B,
+                      cudaDataType_t Btype, int ldb, const void *beta, void *C,
+                      cudaDataType_t Ctype, int ldc,
+                      cudaDataType_t computeType) {
+    auto cublas_call = [&]() {
+#if CUDA_VERSION >= 8000
+      cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
+#if CUDA_VERSION >= 9000
+      bool use_tensor_op_math = platform::TensorCoreAvailable();
+      if (use_tensor_op_math) {
+        algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
+      }
+      VLOG(5) << "use_tensor_op_math: "
+              << (use_tensor_op_math ? "True" : "False");
+#endif  // CUDA_VERSION >= 9000
+
+      PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
+          dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype,
+          lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo));
+#else
+      PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0");
+#endif
+    };
+
+#if CUDA_VERSION >= 9000
+    // NOTES: To use Tensor Core, we should change the cublas config,
+    // but the cublas may be hold by multi-thread.
+    dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH);
+#else
+    cublas_call();
 #endif
  }
 };
@@ -133,8 +215,21 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
  cublasOperation_t cuTransB =
      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;

-  CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha,
-                  B, ldb, A, lda, &beta, C, N);
+#if CUDA_VERSION >= 8000
+  if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
+    auto &cuda_ctx = const_cast<platform::CUDADeviceContext &>(context_);
+    CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B,
+                       CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C,
+                       CUDA_R_32F, N);
+  } else {
+#endif  // CUDA_VERSION >= 8000
+
+    CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K,
+                    &alpha, B, ldb, A, lda, &beta, C, N);
+
+#if CUDA_VERSION >= 8000
+  }
+#endif  // CUDA_VERSION >= 8000
 }

 template <>
@@ -157,30 +252,18 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
  PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53,
                    "cublas fp16 gemm requires GPU compute capability >= 53");

-#if CUDA_VERSION >= 8000
  float h_alpha = static_cast<float>(alpha);
  float h_beta = static_cast<float>(beta);

-  cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
-#if CUDA_VERSION >= 9000
-  if (context_.GetComputeCapability() >= 70) {
-    PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(
-        context_.cublas_handle(), CUBLAS_TENSOR_OP_MATH));
-    algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
-  } else {
-    PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(
-        context_.cublas_handle(), CUBLAS_DEFAULT_MATH));
-  }
-#endif  // CUDA_VERSION >= 9000
-
+#if CUDA_VERSION >= 8000
  // cublasHgemm does true FP16 computation which is slow for non-Volta
  // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
  // input/output in fp16, computation in fp32, which can also be accelerated
  // using tensor cores in volta GPUs.
-  PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
-      context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B,
-      CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N,
-      CUDA_R_32F, algo));
+  auto &cuda_ctx = const_cast<platform::CUDADeviceContext &>(context_);
+  CUBlas<platform::float16>::GEMM_EX(
+      &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16F, ldb, A,
+      CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F);
 #else
  // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
  CUBlas<platform::float16>::GEMM(context_.cublas_handle(), cuTransB, cuTransA,
@@ -199,8 +282,38 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
  // the cblas convention.
  cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
  cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
-  CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha,
-                  B, ldb, A, lda, &beta, C, ldc);
+
+#if CUDA_VERSION >= 8000
+  if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
+    auto &cuda_ctx = const_cast<platform::CUDADeviceContext &>(context_);
+    CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B,
+                       CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C,
+                       CUDA_R_32F, ldc);
+  } else {
+#endif  // CUDA_VERSION >= 8000
+
+    CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K,
+                    &alpha, B, ldb, A, lda, &beta, C, ldc);
+
+#if CUDA_VERSION >= 8000
+  }
+#endif  // CUDA_VERSION >= 8000
+}
+
+template <>
+template <>
+inline void Blas<platform::CUDADeviceContext>::GEMM(
+    bool transA, bool transB, int M, int N, int K, platform::float16 alpha,
+    const platform::float16 *A, int lda, const platform::float16 *B, int ldb,
+    platform::float16 beta, platform::float16 *C, int ldc) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
+  cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
+
+  CUBlas<platform::float16>::GEMM(context_.cublas_handle(), cuTransB, cuTransA,
+                                  N, M, K, &alpha, B, ldb, A, lda, &beta, C,
+                                  ldc);
 }

 template <>
@@ -238,9 +351,34 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  const int64_t strideC = M * N;

-  CUBlas<T>::GEMM_BATCH(context_.cublas_handle(), cuTransB, cuTransA, N, M, K,
-                        &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc,
-                        strideC, batchCount);
+#if CUDA_VERSION >= 9010
+  if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
+    auto cublas_call = [&]() {
+      cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
+      bool use_tensor_op_math = platform::TensorCoreAvailable();
+      if (use_tensor_op_math) {
+        algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
+      }
+      VLOG(5) << "use_tensor_op_math: "
+              << (use_tensor_op_math ? "True" : "False");
+
+      PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx(
+          context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B,
+          CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C,
+          CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo));
+    };
+    auto &dev_ctx = const_cast<platform::CUDADeviceContext &>(context_);
+    dev_ctx.CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH);
+  } else {
+#endif  // CUDA_VERSION >= 9010
+
+    CUBlas<T>::GEMM_STRIDED_BATCH(context_.cublas_handle(), cuTransB, cuTransA,
+                                  N, M, K, &alpha, B, ldb, strideB, A, lda,
+                                  strideA, &beta, C, ldc, strideC, batchCount);
+
+#if CUDA_VERSION >= 9010
+  }
+#endif  // CUDA_VERSION >= 9010
 }

 }  // namespace math

--- a/paddle/fluid/operators/math/fc_compute.h
+++ b/paddle/fluid/operators/math/fc_compute.h
@@ -17,8 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/jit_kernel.h"

-DECLARE_int32(paddle_num_threads);
-
 namespace paddle {
 namespace operators {
 namespace math {
@@ -43,7 +41,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
                           .template Get<jitkernel::VAddKernel<T>>(N);

 #ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
+#pragma omp parallel for
 #endif
    for (int i = 0; i < M; i++) {
      T* dst = Y + i * N;

--- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc
+++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
@@ -106,9 +106,9 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
    out_inx_dim[0] = inx.size();
    out_inx.Resize(out_inx_dim);

+    auto &local_scope = scope.NewScope();
    std::string var_name = "out_index";
-    framework::Variable *tmp_index_var =
-        const_cast<framework::Scope &>(scope).Var(var_name);
+    framework::Variable *tmp_index_var = local_scope.Var(var_name);
    auto &tmp_index_tensor =
        *(tmp_index_var->GetMutable<paddle::framework::LoDTensor>());
    tmp_index_tensor.Resize(out_inx_dim);
@@ -128,12 +128,12 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
    out_dims[axis] = out_dim_sum;
    out.Resize(out_dims);

-    LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names);
-    // Invoke Reshape Op
+    LodTensorArray2LodTensorVector(local_scope, base_name, Input("X"), &names);
+    // Invoke concat Op
    auto concat_op = framework::OpRegistry::CreateOp(
        "concat", {{"X", names}}, {{"Out", {Output("Out")}}}, attrs);

-    concat_op->Run(scope, place);
+    concat_op->Run(local_scope, place);
  }
 };


--- a/paddle/fluid/platform/cpu_helper.cc
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -41,7 +41,7 @@ void SetNumThreads(int num_threads) {
 #elif defined(PADDLE_WITH_MKLML)
  int real_num_threads = num_threads > 1 ? num_threads : 1;
  platform::dynload::MKL_Set_Num_Threads(real_num_threads);
-  omp_set_num_threads(num_threads);
+  omp_set_num_threads(real_num_threads);
 #else
  PADDLE_ENFORCE(false, "To be implemented.");
 #endif

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -143,6 +143,39 @@ class CudnnWorkspaceHandle {
  std::unique_ptr<std::lock_guard<std::mutex>> guard_;
 };

+#if CUDA_VERSION >= 9000
+class ScopedCublasMathMode {
+ public:
+  ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode)
+      : handle_(handle) {
+    need_reset = false;
+    PADDLE_ENFORCE(
+        platform::dynload::cublasGetMathMode(handle_, &old_math_mode_),
+        "Failed to get old cublas math mode");
+    if (old_math_mode_ != new_math_mode) {
+      PADDLE_ENFORCE(
+          platform::dynload::cublasSetMathMode(handle_, new_math_mode),
+          "Failed to set old cublas math mode");
+      need_reset = true;
+    }
+  }
+
+  ~ScopedCublasMathMode() {
+    if (need_reset) {
+      PADDLE_ENFORCE(
+          platform::dynload::cublasSetMathMode(handle_, old_math_mode_),
+          "Failed to set old cublas math mode");
+    }
+  }
+
+ private:
+  cublasHandle_t handle_;
+  cublasMath_t old_math_mode_;
+  bool need_reset;
+};
+
+#endif
+
 class CUDADeviceContext : public DeviceContext {
 public:
  explicit CUDADeviceContext(CUDAPlace place);
@@ -199,6 +232,18 @@ class CUDADeviceContext : public DeviceContext {
    callback_manager_->Wait();
  }

+#if CUDA_VERSION >= 9000
+  /*! \brief CublasCall may need to change cublas's config,
+   *  but the cublas may be hold by multi-thread, so we should
+   *  add lock here. */
+  template <typename Callback>
+  void CublasCall(Callback callback, cublasMath_t new_math) {
+    std::lock_guard<std::mutex> guard(cublas_mtx_);
+    ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math);
+    callback();
+  }
+#endif
+
 private:
  CUDAPlace place_;

@@ -220,6 +265,8 @@ class CUDADeviceContext : public DeviceContext {
  // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes
  mutable std::mutex callback_mtx_;
  std::unique_ptr<StreamCallbackManager> callback_manager_;
+
+  mutable std::mutex cublas_mtx_;
 };

 template <>

--- a/paddle/fluid/platform/dynload/cublas.cc
+++ b/paddle/fluid/platform/dynload/cublas.cc
@@ -32,6 +32,9 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
 CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
 #endif

+#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4
+CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP);
+#endif
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -61,9 +61,6 @@ extern void *cublas_dso_handle;
  extern DynLoad__##__name __name
 #endif

-#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \
-  DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)
-
 #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
  __macro(cublasSaxpy_v2);                \
  __macro(cublasDaxpy_v2);                \
@@ -106,11 +103,22 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)

 // APIs available after CUDA 9.0
 #if CUDA_VERSION >= 9000
-#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) __macro(cublasSetMathMode);
+#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \
+  __macro(cublasSetMathMode);                \
+  __macro(cublasGetMathMode);

 CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
 #endif

+// APIs available after CUDA 9.1
+#if CUDA_VERSION >= 9010
+#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
+  __macro(cublasGemmBatchedEx);              \
+  __macro(cublasGemmStridedBatchedEx);
+
+CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+#endif
+
 #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
 }  // namespace dynload
 }  // namespace platform

--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -35,6 +35,16 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
              "additional trunks of the same size will be requested from gpu "
              "until the gpu has no memory left for another trunk.");

+DEFINE_bool(
+    enable_cublas_tensor_op_math, false,
+    "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
+    "but it may loss precision. Currently, There are two CUDA libraries that"
+    " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
+    " GEMM computations(the matrices must be either half precision or single "
+    "precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
+    "input and output must be half precision) and recurrent neural networks "
+    "(RNNs).");
+
 namespace paddle {
 namespace platform {

@@ -73,6 +83,16 @@ int GetCUDADriverVersion(int id) {
  return driver_version;
 }

+bool TensorCoreAvailable() {
+#if CUDA_VERSION >= 9000
+  int device = GetCurrentDeviceId();
+  int driver_version = GetCUDAComputeCapability(device);
+  return driver_version >= 70;
+#else
+  return false;
+#endif
+}
+
 int GetCUDAMultiProcessors(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
  int count;

--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -35,6 +35,9 @@ int GetCUDARuntimeVersion(int id);
 //! Get the driver version of the ith GPU
 int GetCUDADriverVersion(int id);

+//! Wheter the current device support TensorCore
+bool TensorCoreAvailable();
+
 //! Get the MultiProcessors of the ith GPU.
 int GetCUDAMultiProcessors(int i);


--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -5,8 +5,8 @@ if(WITH_PYTHON)
  if(WITH_AMD_GPU)
    hip_library(paddle_pybind SHARED
      SRCS ${PYBIND_SRCS}
-      DEPS ${PYBIND_DEPS}
-      ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
+      DEPS ARCHIVE_START ${PYBIND_DEPS}
+      ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ARCHIVE_END)
  else()
    cc_library(paddle_pybind SHARED
      SRCS ${PYBIND_SRCS}

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -94,6 +94,30 @@ function cmake_gen() {
            else
                exit 1
            fi
+        elif [ "$1" == "cp36-cp36m" ]; then
+            if [ -d "/Library/Frameworks/Python.framework/Versions/3.6" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/
+                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/
+                export PATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/:${PATH}
+                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3
+            -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/
+            -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib"
+                WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+            else
+                exit 1
+            fi
+        elif [ "$1" == "cp37-cp37m" ]; then
+            if [ -d "/Library/Frameworks/Python.framework/Versions/3.7" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/
+                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/
+                export PATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/:${PATH}
+                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3
+            -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/
+            -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib"
+                WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+            else
+                exit 1
+            fi
        fi
    else
        if [ "$1" != "" ]; then
@@ -116,6 +140,18 @@ function cmake_gen() {
                export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
            -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
+            elif [ "$1" == "cp36-cp36m" ]; then
+                export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH}
+                export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH}
+                export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3
+            -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m
+            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so"
+            elif [ "$1" == "cp37-cp37m" ]; then
+                export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH}
+                export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH}
+                export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3
+            -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
+            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
           fi
        fi
    fi
@@ -419,7 +455,7 @@ function assert_api_not_changed() {
    source .env/bin/activate
    pip install ${PADDLE_ROOT}/build/python/dist/*whl
    python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
-    if [ "$1" == "cp35-cp35m" ]; then
+    if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then
        # Use sed to make python2 and python3 sepc keeps the same
        sed -i 's/arg0: str/arg0: unicode/g' new.spec
        sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec

--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -28,7 +28,7 @@ int main(int argc, char** argv) {
  for (int i = 0; i < argc; ++i) {
    new_argv.push_back(argv[i]);
  }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  new_argv.push_back(
      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy"));
 #else

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -129,11 +129,13 @@ def __bootstrap__():
        read_env_flags.append('rpc_send_thread_num')
        read_env_flags.append('rpc_get_thread_num')
        read_env_flags.append('rpc_prefetch_thread_num')
+        read_env_flags.append('rpc_disable_reuse_port')

    if core.is_compiled_with_cuda():
        read_env_flags += [
            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
-            'conv_workspace_size_limit', 'cudnn_exhaustive_search'
+            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
+            'cudnn_exhaustive_search'
        ]
    core.init_gflags([sys.argv[0]] +
                     ["--tryfromenv=" + ",".join(read_env_flags)])

--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -896,9 +896,10 @@ def array_to_lod_tensor(x, table):

 def increment(x, value=1.0, in_place=True):
    """
-    This function performs an operation that increments each value in the
+    This function performs an operation that increments the value in the
    input :math:`x` by an amount: :math:`value` as mentioned in the input
-    parameter. This operation is performed in-place by default.
+    parameter. This operation is performed in-place by default. Notice that
+    the number of elements in :math:`x` must be equal to 1.

    Args:
        x (Variable|list): The tensor that has the input values.
@@ -911,7 +912,8 @@ def increment(x, value=1.0, in_place=True):
    Examples:
        .. code-block:: python

-          data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32')
+          data = fluid.layers.data(name='data', shape=[1], dtype='float32',
+                                   append_batch_size=False)
          data = fluid.layers.increment(x=data, value=3.0, in_place=True)
    """
    helper = LayerHelper("increment", **locals())

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -1029,6 +1029,7 @@ def density_prior_box(input,
                      clip=False,
                      steps=[0.0, 0.0],
                      offset=0.5,
+                      flatten_to_2d=False,
                      name=None):
    """
    **Density Prior Box Operator**
@@ -1065,22 +1066,24 @@ def density_prior_box(input,
            height/weight of the input will be automatically calculated.
            Default: [0., 0.]
       offset(float): Prior boxes center offset. Default: 0.5
+       flatten_to_2d(bool): Whether to flatten output prior boxes and variance
+           to 2D shape, the second dim is 4. Default: False.
       name(str): Name of the density prior box op. Default: None.

    Returns:
        tuple: A tuple with two Variable (boxes, variances)

        boxes: the output density prior boxes of PriorBox.
-        The layout is [H, W, num_priors, 4].
-        H is the height of input, W is the width of input,
-        num_priors is the total
-        box count of each position of input.
+            The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
+            The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
+            H is the height of input, W is the width of input,
+            num_priors is the total box count of each position of input.

        variances: the expanded variances of PriorBox.
-        The layout is [H, W, num_priors, 4].
-        H is the height of input, W is the width of input
-        num_priors is the total
-        box count of each position of input
+            The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
+            The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
+            H is the height of input, W is the width of input
+            num_priors is the total box count of each position of input.


    Examples:
@@ -1089,14 +1092,11 @@ def density_prior_box(input,
            box, var = fluid.layers.density_prior_box(
                input=conv1,
                image=images,
-                min_sizes=[100.],
-                max_sizes=[200.],
-                aspect_ratios=[1.0, 1.0 / 2.0, 2.0],
-                densities=[3, 4],
-                fixed_sizes=[50., 60.],
-                fixed_ratios=[1.0, 3.0, 1.0 / 3.0],
-                flip=True,
-                clip=True)
+                densities=[4, 2, 1],
+                fixed_sizes=[32.0, 64.0, 128.0],
+                fixed_ratios=[1.],
+                clip=True,
+                flatten_to_2d=True)
    """
    helper = LayerHelper("density_prior_box", **locals())
    dtype = helper.input_dtype()
@@ -1127,14 +1127,11 @@ def density_prior_box(input,
        'step_w': steps[0],
        'step_h': steps[1],
        'offset': offset,
+        'densities': densities,
+        'fixed_sizes': fixed_sizes,
+        'fixed_ratios': fixed_ratios,
+        'flatten_to_2d': flatten_to_2d,
    }
-    if densities is not None and len(densities) > 0:
-        attrs['densities'] = densities
-    if fixed_sizes is not None and len(fixed_sizes) > 0:
-        attrs['fixed_sizes'] = fixed_sizes
-    if fixed_ratios is not None and len(fixed_ratios) > 0:
-        attrs['fixed_ratios'] = fixed_ratios
-
    box = helper.create_variable_for_type_inference(dtype)
    var = helper.create_variable_for_type_inference(dtype)
    helper.append_op(

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -2134,11 +2134,16 @@ def pool2d(input,
                          input tensor is NCHW, where N is batch size, C is
                          the number of channels, H is the height of the
                          feature, and W is the width of the feature.
-        pool_size (int): The side length of pooling windows. All pooling
-                         windows are squares with pool_size on a side.
+        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two integers, (pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
        pool_type: ${pooling_type_comment}
-        pool_stride (int): stride of the pooling layer.
-        pool_padding (int): padding size.
+        pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            Otherwise, the pool stride size will be a square of an int.
+        pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple,
+            it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width).
+            Otherwise, the pool padding size will be a square of an int.
        global_pooling (bool): ${global_pooling_comment}
        use_cudnn (bool): ${use_cudnn_comment}
        ceil_mode (bool): ${ceil_mode_comment}

--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -250,7 +250,8 @@ def sequence_conv_pool(input,
                       filter_size,
                       param_attr=None,
                       act="sigmoid",
-                       pool_type="max"):
+                       pool_type="max",
+                       bias_attr=None):
    """
    The sequence_conv_pool is composed with Sequence Convolution and Pooling.

@@ -266,6 +267,11 @@ def sequence_conv_pool(input,
        pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for
            average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling.
            Default :math:`max`.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, sequence_conv
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.

    Return:
        Variable: The final result after Sequence Convolution and Pooling.
@@ -289,6 +295,7 @@ def sequence_conv_pool(input,
        num_filters=num_filters,
        filter_size=filter_size,
        param_attr=param_attr,
+        bias_attr=bias_attr,
        act=act)

    pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type)

--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -239,7 +239,7 @@ def infer(use_cuda, save_dirname=None):
        assert len(results[0]) == len(transpiler_results[0])
        for i in range(len(results[0])):
            np.testing.assert_almost_equal(
-                results[0][i], transpiler_results[0][i], decimal=5)
+                results[0][i], transpiler_results[0][i], decimal=4)

        print("infer results: ", results[0])


--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -112,38 +112,42 @@ class TestDetection(unittest.TestCase):

 class TestPriorBox(unittest.TestCase):
    def test_prior_box(self):
-        data_shape = [3, 224, 224]
-        images = fluid.layers.data(
-            name='pixel', shape=data_shape, dtype='float32')
-        conv1 = fluid.layers.conv2d(images, 3, 3, 2)
-        box, var = layers.prior_box(
-            input=conv1,
-            image=images,
-            min_sizes=[100.0],
-            aspect_ratios=[1.],
-            flip=True,
-            clip=True)
-        assert len(box.shape) == 4
-        assert box.shape == var.shape
-        assert box.shape[3] == 4
+        program = Program()
+        with program_guard(program):
+            data_shape = [3, 224, 224]
+            images = fluid.layers.data(
+                name='pixel', shape=data_shape, dtype='float32')
+            conv1 = fluid.layers.conv2d(images, 3, 3, 2)
+            box, var = layers.prior_box(
+                input=conv1,
+                image=images,
+                min_sizes=[100.0],
+                aspect_ratios=[1.],
+                flip=True,
+                clip=True)
+            assert len(box.shape) == 4
+            assert box.shape == var.shape
+            assert box.shape[3] == 4


 class TestDensityPriorBox(unittest.TestCase):
    def test_density_prior_box(self):
-        data_shape = [3, 224, 224]
-        images = fluid.layers.data(
-            name='pixel', shape=data_shape, dtype='float32')
-        conv1 = fluid.layers.conv2d(images, 3, 3, 2)
-        box, var = layers.density_prior_box(
-            input=conv1,
-            image=images,
-            densities=[3, 4],
-            fixed_sizes=[50., 60.],
-            fixed_ratios=[1.0],
-            clip=True)
-        assert len(box.shape) == 4
-        assert box.shape == var.shape
-        assert box.shape[3] == 4
+        program = Program()
+        with program_guard(program):
+            data_shape = [3, 224, 224]
+            images = fluid.layers.data(
+                name='pixel', shape=data_shape, dtype='float32')
+            conv1 = fluid.layers.conv2d(images, 3, 3, 2)
+            box, var = layers.density_prior_box(
+                input=conv1,
+                image=images,
+                densities=[3, 4],
+                fixed_sizes=[50., 60.],
+                fixed_ratios=[1.0],
+                clip=True)
+            assert len(box.shape) == 4
+            assert box.shape == var.shape
+            assert box.shape[-1] == 4


 class TestAnchorGenerator(unittest.TestCase):

--- a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
@@ -36,7 +36,8 @@ class TestDensityPriorBoxOp(OpTest):
            'offset': self.offset,
            'densities': self.densities,
            'fixed_sizes': self.fixed_sizes,
-            'fixed_ratios': self.fixed_ratios
+            'fixed_ratios': self.fixed_ratios,
+            'flatten_to_2d': self.flatten_to_2d
        }
        self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}

@@ -48,16 +49,17 @@ class TestDensityPriorBoxOp(OpTest):
        self.set_data()

    def set_density(self):
-        self.densities = []
-        self.fixed_sizes = []
-        self.fixed_ratios = []
+        self.densities = [4, 2, 1]
+        self.fixed_sizes = [32.0, 64.0, 128.0]
+        self.fixed_ratios = [1.0]
+        self.layer_w = 17
+        self.layer_h = 17
+        self.image_w = 533
+        self.image_h = 533
+        self.flatten_to_2d = False

    def init_test_params(self):
-        self.layer_w = 32
-        self.layer_h = 32
-
-        self.image_w = 40
-        self.image_h = 40
+        self.set_density()

        self.step_w = float(self.image_w) / float(self.layer_w)
        self.step_h = float(self.image_h) / float(self.layer_h)
@@ -69,8 +71,6 @@ class TestDensityPriorBoxOp(OpTest):
        self.variances = [0.1, 0.1, 0.2, 0.2]
        self.variances = np.array(self.variances, dtype=np.float).flatten()

-        self.set_density()
-
        self.clip = True
        self.num_priors = 0
        if len(self.fixed_sizes) > 0 and len(self.densities) > 0:
@@ -129,6 +129,9 @@ class TestDensityPriorBoxOp(OpTest):
                          (self.layer_h, self.layer_w, self.num_priors, 1))
        self.out_boxes = out_boxes.astype('float32')
        self.out_var = out_var.astype('float32')
+        if self.flatten_to_2d:
+            self.out_boxes = self.out_boxes.reshape((-1, 4))
+            self.out_var = self.out_var.reshape((-1, 4))


 class TestDensityPriorBox(TestDensityPriorBoxOp):
@@ -136,6 +139,11 @@ class TestDensityPriorBox(TestDensityPriorBoxOp):
        self.densities = [3, 4]
        self.fixed_sizes = [1.0, 2.0]
        self.fixed_ratios = [1.0]
+        self.layer_w = 32
+        self.layer_h = 32
+        self.image_w = 40
+        self.image_h = 40
+        self.flatten_to_2d = True


 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -202,6 +202,17 @@ class TestBook(unittest.TestCase):
            self.assertIsNotNone(layers.sequence_unpad(x=x, length=length))
        print(str(program))

+    def test_pool2d(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 224, 224], dtype='float32')
+            self.assertIsNotNone(
+                layers.pool2d(
+                    x,
+                    pool_size=[5, 3],
+                    pool_stride=[1, 2],
+                    pool_padding=(2, 1)))
+
    def test_lstm_unit(self):
        program = Program()
        with program_guard(program):

--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
@@ -145,10 +145,15 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
        lod.append(nmsed_num)
        if nmsed_num == 0: continue

+        tmp_det_out = []
        for c, indices in nmsed_outs.items():
            for idx in indices:
                xmin, ymin, xmax, ymax = boxes[n][idx][:]
-                det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax])
+                tmp_det_out.append(
+                    [c, scores[n][c][idx], xmin, ymin, xmax, ymax])
+        sorted_det_out = sorted(
+            tmp_det_out, key=lambda tup: tup[0], reverse=False)
+        det_outs.extend(sorted_det_out)

    return det_outs, lod

@@ -210,7 +215,7 @@ class TestMulticlassNMSOp(OpTest):
 class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp):
    def set_argument(self):
        # Here set 2.0 to test the case there is no outputs.
-        # In practical use, 0.0 < score_threshold < 1.0 
+        # In practical use, 0.0 < score_threshold < 1.0
        self.score_threshold = 2.0



--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -16,7 +16,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
 RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
 COPY build_scripts /build_scripts
 RUN bash build_scripts/build.sh && \
-  bash build_scripts/install_nccl2.sh && rm -r build_scripts
+  bash build_scripts/install_nccl2.sh && rm -rf build_scripts

 ENV SSL_CERT_FILE=/opt/_internal/certs.pem


--- a/tools/manylinux1/build_scripts/build_utils.sh
+++ b/tools/manylinux1/build_scripts/build_utils.sh
@@ -50,6 +50,15 @@ function do_cpython_build {
    mkdir -p ${prefix}/lib
    # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6

+    if [ $(lex_pyver $py_ver) -eq $(lex_pyver 3.6) ]; then
+        wget https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz
+        tar -zxf sqlite-autoconf-3250300.tar.gz
+        cd sqlite-autoconf-3250300
+        ./configure --prefix=/usr/local
+        make -j8 && make install
+        cd ../ && rm sqlite-autoconf-3250300.tar.gz
+    fi
+
    # NOTE --enable-shared for generating libpython shared library needed for
    # linking of some of the nupic.core test executables.
    if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.7) ]; then
@@ -59,9 +68,9 @@ function do_cpython_build {
        make -j8 > /dev/null
        make altinstall > /dev/null
    else
-        CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null
-        make -j8 > /dev/null
-        make install > /dev/null
+        LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null
+        LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make -j8 > /dev/null
+        LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make install > /dev/null
    fi
    popd
    echo "ZZZ looking for libpython"