Commit 547225dc authored by zlsh80826

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into nvinfer_plugin_exp_merge

@@ -28,7 +28,10 @@ include(generic) # simplify cmake module
 # TODO(Shibo Tao): remove find_package(CUDA) completely.
 find_package(CUDA QUIET)
 option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
+option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN" OFF)
+if (WITH_GPU AND WITH_XPU)
+  message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
+endif()
 # cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
 if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
   message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "

@@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
 ARG WITH_GPU
 ARG WITH_AVX
-ENV WOBOQ OFF
 ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}

@@ -149,21 +148,11 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
 # version until jupyter fixes this issue.
-# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
-# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
-# version(1.7.1 for now), which causes building documentation failed.
 RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
     pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
     pip3.7 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
     pip --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark
 RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \

@@ -184,9 +173,9 @@ RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
 RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
 RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
 RUN pip3 --no-cache-dir install coverage
 RUN pip3.6 --no-cache-dir install coverage
 RUN pip3.7 --no-cache-dir install coverage
 RUN pip --no-cache-dir install coverage
 COPY ./python/requirements.txt /root/

@@ -204,12 +193,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
 RUN pip --no-cache-dir install certifi urllib3[secure]
-# Install woboq_codebrowser to /woboq
-RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
-    (cd /woboq \
-      cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-            -DCMAKE_BUILD_TYPE=Release . \
-      make)
 # ar mishandles 4GB files
 # https://sourceware.org/bugzilla/show_bug.cgi?id=14625

@@ -33,7 +33,7 @@ pip install paddlepaddle
 # Linux GPU cuda10cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu==1.8.3.post97
+pip install paddlepaddle-gpu==1.8.4.post97
 ```
 It is recommended to read [this doc](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/install/index_en.html) on our website.

@@ -30,7 +30,7 @@ pip install paddlepaddle
 # Linux GPU cuda10cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu==1.8.3.post97
+pip install paddlepaddle-gpu==1.8.4.post97
 ```
 See the [installation guide](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.8/beginners_guide/install/index_cn.html) on the official website for more installation details.

@@ -63,6 +63,11 @@ if(WITH_BOX_PS)
     add_definitions(-DPADDLE_WITH_BOX_PS)
 endif()
+if(WITH_XPU)
+    message(STATUS "Compile with XPU!")
+    add_definitions(-DPADDLE_WITH_XPU)
+endif()
 if(WITH_GPU)
     add_definitions(-DPADDLE_WITH_CUDA)
     add_definitions(-DEIGEN_USE_GPU)

@@ -61,6 +61,10 @@ function(detect_installed_gpus out_variable)
     if(NOT CUDA_gpu_detect_output)
       message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
       set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
+      # TODO: fix automatic GPU detection failing on Windows
+      if(WIN32)
+        set(${out_variable} "61 75" PARENT_SCOPE)
+      endif()
     else()
       set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
     endif()

@@ -202,6 +206,11 @@ if (NOT WIN32) # windows msvc2015 support c++11 natively.
 set(CMAKE_CUDA_STANDARD 11)
 endif(NOT WIN32)
+# (Note) For Windows, if /W[1-4] is deleted, /W1 will be added by default and conflict with -w.
+# So replace /W[1-4] with /W0.
+if (WIN32)
+  string(REGEX REPLACE "/W[1-4]" " /W0 " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
+endif(WIN32)
 # in cuda9, suppress cuda warning on eigen
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -w")
 # Set :expt-relaxed-constexpr to suppress Eigen warnings

@@ -17,7 +17,7 @@ include(ExternalProject)
 set(CUB_PREFIX_DIR ${THIRD_PARTY_PATH}/cub)
 set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub/src/extern_cub)
 set(CUB_REPOSITORY https://github.com/NVlabs/cub.git)
-set(CUB_TAG 1.9.8)
+set(CUB_TAG 1.8.0)
 cache_third_party(extern_cub
     REPOSITORY   ${CUB_REPOSITORY}

@@ -14,13 +14,21 @@
 INCLUDE(ExternalProject)
+execute_process(COMMAND bash -c "gcc -dumpversion" OUTPUT_VARIABLE GCC_VERSION)
 SET(GLOO_PROJECT "extern_gloo")
 IF((NOT DEFINED GLOO_VER) OR (NOT DEFINED GLOO_URL))
   MESSAGE(STATUS "use pre defined download url")
   SET(GLOO_VER "master" CACHE STRING "" FORCE)
   SET(GLOO_NAME "gloo" CACHE STRING "" FORCE)
-  SET(GLOO_URL "https://pslib.bj.bcebos.com/gloo.tar.gz" CACHE STRING "" FORCE)
+  if(${GCC_VERSION} VERSION_EQUAL "8.2.0")
+    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc8" CACHE STRING "" FORCE)
+  else()
+    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc482" CACHE STRING "" FORCE)
+  endif()
 ENDIF()
 MESSAGE(STATUS "GLOO_NAME: ${GLOO_NAME}, GLOO_URL: ${GLOO_URL}")
 SET(GLOO_SOURCE_DIR "${THIRD_PARTY_PATH}/gloo")
 SET(GLOO_DOWNLOAD_DIR "${GLOO_SOURCE_DIR}/src/${GLOO_PROJECT}")

@@ -34,7 +34,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 42ab4d559f6659edfc35040fb30fdcec3dc3f8aa)
+    set(LITE_GIT_TAG dfdfa6440c83bf0b415f9f5a9ff84842ce0bb0fa)
   endif()
   if(NOT CUDA_ARCH_NAME)

@@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
 SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
 SET(MKLDNN_REPOSITORY https://github.com/intel/mkl-dnn.git)
-SET(MKLDNN_TAG fb95345126ade4c54f5507e580a5f5da8d30a515)
+SET(MKLDNN_TAG 1ea812f4f5aa1bd989372a23ab50d0f0f81ee677)
 # Introduce variables:
 # * CMAKE_INSTALL_LIBDIR

if (NOT WITH_XPU)
return()
endif()
INCLUDE(ExternalProject)
SET(XPU_PROJECT "extern_xpu")
SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE)
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/api/include")
SET(XPU_RUNTIME_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/runtime/include")
SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
SET(XPU_API_LIB_NAME "libxpuapi.so")
SET(XPU_RT_LIB_NAME "libxpurt.so")
SET(XPU_SIM_LIB_NAME "libxpusim.so")
SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
SET(XPU_SIM_LIB "${XPU_LIB_DIR}/${XPU_SIM_LIB_NAME}")
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
INCLUDE_DIRECTORIES(${XPU_RUNTIME_INC_DIR})
FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(XPU)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY xpu/api xpu/runtime xpu/lib \n"
" DESTINATION ${XPU_INSTALL_DIR})\n")
ExternalProject_Add(
${XPU_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${XPU_SOURCE_DIR}
DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz
&& tar xvf xpu.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
)
ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL)
set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
# generate a static dummy target to track xpulib dependencies
# for cc_library(xxx SRCS xxx.c DEPS xpulib)
generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake")
TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_SIM_LIB})
ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})

@@ -232,7 +232,9 @@ if(WIN32)
                 CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
                 CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
                 CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-        string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
+        string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
-        set(flag_var "${flag_var} /w")
+    endforeach(flag_var)
+    foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
+        set(${flag_var} "${${flag_var}} /w")
     endforeach(flag_var)
 endif()

@@ -384,8 +384,12 @@ function(cc_test_run TARGET_NAME)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
-    # No unit test should exceed 10 minutes.
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+    # No unit test should exceed 2 minutes.
+    if (APPLE OR WIN32)
+      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+    else()
+      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
+    endif()
   endif()
 endfunction()

@@ -742,9 +746,14 @@ function(py_test TARGET_NAME)
                  ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
                  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
+    if (APPLE OR WIN32)
+      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+    else()
+      # No unit test should exceed 2 minutes in Linux.
+      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
+    endif()
-  # No unit test should exceed 10 minutes.
-  set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
   endif()
 endfunction()

@@ -110,10 +110,12 @@ function(copy_part_of_thrid_party TARGET DST)
             SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
             DSTS ${dst_dir} ${dst_dir}/lib)
+    if (WITH_CRYPTO)
     set(dst_dir "${DST}/third_party/install/cryptopp")
     copy(${TARGET}
             SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES}
             DSTS ${dst_dir} ${dst_dir}/lib)
+    endif()
     set(dst_dir "${DST}/third_party/install/xxhash")
     copy(${TARGET}

@@ -187,7 +189,7 @@ copy(inference_lib_dist
         SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
         DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/internal)
 copy(inference_lib_dist
-        SRCS ${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io/crypto/cipher.h
+        SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io/crypto/cipher.h
         DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
 include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)

 # Attention: cmake will append these flags to compile command automatically.
 # So if you want to add global option, change this file rather than flags.cmake
-# default: "-g"
-set(CMAKE_C_FLAGS_DEBUG "-g")
-# default: "-O3 -DNDEBUG"
-set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG")
-# default: "-O2 -g -DNDEBUG"
-set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
-# default: "-Os -DNDEBUG"
-set(CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG")
-# default: "-g"
-set(CMAKE_CXX_FLAGS_DEBUG "-g")
-# default: "-O3 -DNDEBUG"
-set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
-# default: "-O2 -g -DNDEBUG"
-set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
-# default: "-Os -DNDEBUG"
-set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
-# default: "-g"
-set(CMAKE_CUDA_FLAGS_DEBUG "-g")
-# default: "-O3 -DNDEBUG"
-set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG")
-# default: "-O2 -g -DNDEBUG"
-set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
-# default: "-O1 -DNDEBUG"
-set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
+# NOT WIN32
+# DEBUG: default: "-g"
+# RELEASE: default: "-O3 -DNDEBUG"
+# RELWITHDEBINFO: default: "-O2 -g -DNDEBUG"
+# MINSIZEREL: default: "-O2 -g -DNDEBUG"
+if(NOT WIN32)
+    set(CMAKE_C_FLAGS_DEBUG "-g")
+    set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG")
+    set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
+    set(CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG")
+    set(CMAKE_CXX_FLAGS_DEBUG "-g")
+    set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
+    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
+    set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
+endif()
+if(WITH_GPU)
+    set(CMAKE_CUDA_FLAGS_DEBUG "-g")
+    set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG")
+    set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
+    set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
+endif()

@@ -8,12 +8,13 @@ function(op_library TARGET)
     set(hip_cu_srcs)
     set(miopen_hip_cc_srcs)
     set(cu_cc_srcs)
+    set(xpu_cc_srcs)
     set(cudnn_cu_cc_srcs)
     set(cudnn_cu_srcs)
     set(CUDNN_FILE)
     set(mkldnn_cc_srcs)
     set(MKLDNN_FILE)
-    set(op_common_deps operator op_registry math_function layer)
+    set(op_common_deps operator op_registry math_function layer common_infer_shape_functions)
     set(options "")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)

@@ -60,6 +61,12 @@ function(op_library TARGET)
             list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
         endif()
     endif()
+    if(WITH_XPU)
+        string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc)
+            list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc)
+        endif()
+    endif()
     else()
         foreach(src ${op_library_SRCS})
             if (${src} MATCHES ".*\\.hip.cu$")

@@ -76,6 +83,8 @@ function(op_library TARGET)
                 list(APPEND mkldnn_cc_srcs ${src})
             elseif(${src} MATCHES ".*\\.cu.cc$")
                 list(APPEND cu_cc_srcs ${src})
+            elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$")
+                list(APPEND xpu_cc_srcs ${src})
             elseif(${src} MATCHES ".*\\.cc$")
                 list(APPEND cc_srcs ${src})
             else()

@@ -109,7 +118,7 @@ function(op_library TARGET)
         hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
     else()
-        cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
+        cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
     endif()

@@ -150,10 +159,11 @@ function(op_library TARGET)
     list(LENGTH cu_srcs cu_srcs_len)
     list(LENGTH cu_cc_srcs cu_cc_srcs_len)
     list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
+    list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
     list(LENGTH hip_cu_srcs hip_cu_srcs_len)
     list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
     if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
-        ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
+        ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
         file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
         set(pybind_flag 1)
     endif()

@@ -179,6 +189,9 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
     endif()
+    if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
+    endif()
     # pybind USE_OP_DEVICE_KERNEL for MKLDNN
     if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
         # Append first implemented MKLDNN activation operator

@@ -228,6 +241,7 @@ function(register_operators)
     file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
     string(REPLACE "_mkldnn" "" OPS "${OPS}")
+    string(REPLACE "_xpu" "" OPS "${OPS}")
     string(REPLACE ".cc" "" OPS "${OPS}")
     list(REMOVE_DUPLICATES OPS)
     list(LENGTH register_operators_DEPS register_operators_DEPS_len)

@@ -250,6 +250,11 @@ if(WITH_GPU)
     file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
 endif(WITH_GPU)
+if(WITH_XPU)
+    include(external/xpu) # download, build, install xpu
+    list(APPEND third_party_deps extern_xpu)
+endif(WITH_XPU)
 if(WITH_PSLIB)
     include(external/pslib) # download, build, install pslib
     list(APPEND third_party_deps extern_pslib)

@@ -263,10 +268,6 @@ if(WITH_PSLIB)
     endif()
 endif(WITH_PSLIB)
-if(NOT WIN32 AND NOT APPLE)
-    include(external/gloo)
-    list(APPEND third_party_deps extern_gloo)
-endif()
 if(WITH_BOX_PS)
     include(external/box_ps)

@@ -274,6 +275,11 @@ if(WITH_BOX_PS)
 endif(WITH_BOX_PS)
 if(WITH_DISTRIBUTE)
+    if(WITH_GLOO)
+        include(external/gloo)
+        list(APPEND third_party_deps extern_gloo)
+    endif()
     if(WITH_GRPC)
         list(APPEND third_party_deps extern_grpc)
     else()

@@ -154,10 +154,17 @@ func (config *AnalysisConfig) EnableMkldnnQuantizer() {
 	C.PD_EnableMkldnnQuantizer(config.c)
 }

+func (config *AnalysisConfig) EnableMkldnnBfloat16() {
+	C.PD_EnableMkldnnBfloat16(config.c)
+}
+
 func (config *AnalysisConfig) MkldnnQuantizerEnabled() bool {
 	return ConvertCBooleanToGo(C.PD_MkldnnQuantizerEnabled(config.c))
 }

+func (config *AnalysisConfig) MkldnnBfloat16Enabled() bool {
+	return ConvertCBooleanToGo(C.PD_MkldnnBfloat16Enabled(config.c))
+}
+
 // SetModelBuffer
 // ModelFromMemory

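These Go methods are thin wrappers over the `PD_*` C inference API. A minimal C++ sketch of the equivalent calls, using only the two functions the Go code wraps; the header name and the config create/delete helpers are assumptions, not part of this diff:

```cpp
#include "paddle_c_api.h"  // assumed header exposing the PD_* C API

void ToggleBfloat16() {
  PD_AnalysisConfig* config = PD_NewAnalysisConfig();   // assumed helper
  PD_EnableMkldnnBfloat16(config);                      // wrapped by EnableMkldnnBfloat16()
  bool on = PD_MkldnnBfloat16Enabled(config);           // wrapped by MkldnnBfloat16Enabled()
  (void)on;
  PD_DeleteAnalysisConfig(config);                      // assumed helper
}
```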
@@ -119,9 +119,13 @@ cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_
 cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
         framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)
-cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
+cc_library(attribute SRCS attribute.cc DEPS framework_proto boost enforce)
 cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
     device_context)
+cc_library(op_version_registry SRCS op_version_registry.cc DEPS framework_proto boost)
+cc_test(op_version_registry_test SRCS op_version_registry_test.cc DEPS op_version_registry)
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(no_need_buffer_vars_inference SRCS no_need_buffer_vars_inference.cc DEPS attribute device_context)

@@ -164,23 +168,23 @@ if(WITH_PYTHON)
 if (NOT WIN32)
 add_custom_command(TARGET framework_py_proto POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
-    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/__init__.py
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
+    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py
     COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
-    COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
+    COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
     COMMENT "Copy generated python proto into directory paddle/fluid/proto."
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 else(NOT WIN32)
     string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/")
-    string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/")
+    string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/")
     add_custom_command(TARGET framework_py_proto POST_BUILD
         COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
-        COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/__init__.py
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
+        COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py
         COMMAND copy /Y *.py ${proto_dstpath}
         COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath}
         COMMENT "Copy generated python proto into directory paddle/fluid/proto."
-        COMMENT "Copy generated python proto into directory paddle/fleet/proto."
+        COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto."
         WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif(NOT WIN32)
 endif()

@@ -268,6 +272,7 @@ cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatib
 cc_library(save_load_util SRCS save_load_util DEPS tensor scope layer)
 cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
+cc_library(generator SRCS generator.cc)
 # Get the current working branch
 execute_process(

@@ -117,7 +117,7 @@ static void TransData(const framework::LoDTensor &src_item,
       TensorCopy(src_item, platform::CPUPlace(), dst_item);
 #endif
     } else {
-      dst_item->ShareDataWith(src_item);
+      TensorCopy(src_item, platform::CPUPlace(), dst_item);
     }
   } else {
     dst_item->clear();

@@ -55,9 +55,8 @@ message LarsConfig {
 }

 message LambConfig {
-  optional float beta1 = 1 [ default = 0.001 ];
-  optional float beta2 = 2 [ default = 0.999 ];
-  optional float epsilon = 3 [ default = 0.000001 ];
+  optional float lamb_weight_decay = 1 [ default = 0.01 ];
+  repeated string exclude_from_weight_decay = 2;
 }

 message BuildStrategy {

@@ -80,7 +79,7 @@ message ExecutionStrategy {
 }

 message AsyncConfig {
-  optional int32 k_steps = 1 [ default = 1 ];
+  optional int32 k_steps = 1 [ default = -1 ];
   optional int32 max_merge_var_num = 2 [ default = 1 ];
   optional int32 send_queue_size = 3 [ default = 16 ];
   optional bool independent_recv_thread = 4 [ default = false ];

@@ -114,7 +113,9 @@ message DistributedStrategy {
   optional bool fuse_all_reduce_ops = 18 [ default = true ];
   optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ];
   optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ];
-  // optional bool enable_backward_optimizer_op_deps = 19 [ default = true ];
+  optional bool cudnn_exhaustive_search = 21 [ default = true ];
+  optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
+  optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];

   optional RecomputeConfig recompute_configs = 101;
   optional AMPConfig amp_configs = 102;

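For reference, the three new fields behave like any other optional proto2 field in the generated code. A minimal C++ sketch, assuming the generated header name and a `paddle.fleet` package (the `package` line is outside this hunk, so treat both as assumptions):

```cpp
#include "distributed_strategy.pb.h"  // assumed name of the protoc output

int main() {
  paddle::fleet::DistributedStrategy strategy;             // assumed namespace
  bool search = strategy.cudnn_exhaustive_search();        // default: true
  strategy.set_conv_workspace_size_limit(1024);            // override default 4000
  strategy.set_cudnn_batchnorm_spatial_persistent(false);  // default: true
  return (search && strategy.has_conv_workspace_size_limit()) ? 0 : 1;
}
```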
@@ -70,6 +70,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
     return ctx;
   }

+  inline ::DLContext operator()(const platform::XPUPlace &place) const {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("platform::XPUPlace is not supported"));
+  }
+
   inline ::DLContext operator()(const platform::CUDAPlace &place) const {
 #ifdef PADDLE_WITH_CUDA
     ::DLContext ctx;

@@ -444,8 +444,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector> gc;
   if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
-#ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
+#ifdef PADDLE_WITH_CUDA
       if (IsFastEagerDeletionModeEnabled()) {
         gc.reset(new UnsafeFastGPUGarbageCollector(
             BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));

@@ -453,13 +453,22 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
         gc.reset(new DefaultStreamGarbageCollector(
             BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
       }
-    } else if (platform::is_cpu_place(place_)) {
+#else
+      PADDLE_THROW(
+          platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle"));
 #endif
+    } else if (platform::is_cpu_place(place_)) {
       gc.reset(new CPUGarbageCollector(
           BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size));
-#ifdef PADDLE_WITH_CUDA
-    }
+    } else if (platform::is_xpu_place(place_)) {
+#ifdef PADDLE_WITH_XPU
+      gc.reset(new XPUGarbageCollector(
+          BOOST_GET_CONST(platform::XPUPlace, place_), max_memory_size));
+#else
+      PADDLE_THROW(
+          platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
 #endif
+    }
   }

   for (int64_t i = start_op_index; i < end_op_index; ++i) {

@@ -19,6 +19,6 @@ else()
     cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope)
 endif(WITH_GLOO)
-cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context)
+cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context heter_service_proto)
 cc_test(test_fleet SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)

@@ -105,6 +105,11 @@ enum GlooStoreType { HDFS, HTTP };

 class GlooWrapper {
  public:
+  static std::shared_ptr<GlooWrapper> GetInstance() {
+    static auto s_instance = std::make_shared<GlooWrapper>();
+    return s_instance;
+  }
+
   GlooWrapper() {}

   virtual ~GlooWrapper() {}

@@ -153,6 +158,11 @@ class GlooWrapper {
 #endif
   }

+  bool IsInitialized() { return is_initialized_; }
+#ifdef PADDLE_WITH_GLOO
+  std::shared_ptr<gloo::Context> GetContext() { return context_; }
+#endif
+
   template <typename T>
   std::vector<T> AllReduce(std::vector<T>& sendbuf,            // NOLINT
                            const std::string& mode = "sum") {  // NOLINT

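With `GetInstance()` in place, callers can share one process-wide GlooWrapper instead of constructing their own. A minimal sketch of the intended call pattern, using only members declared above; the include path is inferred from the fleet directory and the rendezvous/initialization step is outside this hunk:

```cpp
#include <cstdint>
#include <vector>

#include "paddle/fluid/framework/fleet/gloo_wrapper.h"  // assumed path

std::vector<int64_t> SumAcrossRanks(const std::vector<int64_t>& local) {
  auto gloo = paddle::framework::GlooWrapper::GetInstance();
  if (!gloo->IsInitialized()) {
    // Init(...) with an HDFS/HTTP store is expected to have run already.
    return local;
  }
  std::vector<int64_t> sendbuf = local;    // AllReduce takes a mutable reference
  return gloo->AllReduce(sendbuf, "sum");  // mode defaults to "sum"
}
```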
@@ -115,6 +115,7 @@ message VarType {
     SIZE_T = 19;
     UINT8 = 20;
     INT8 = 21;
+    BF16 = 22;

     // Other types that may need additional descriptions
     LOD_TENSOR = 7;

@@ -50,6 +50,15 @@ void CPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
   callback();
 }

+#ifdef PADDLE_WITH_XPU
+XPUGarbageCollector::XPUGarbageCollector(const platform::XPUPlace &place,
+                                         size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+void XPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
+  callback();
+}
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
     const platform::CUDAPlace &place, size_t max_memory_size)

@@ -59,6 +59,16 @@ class CPUGarbageCollector : public GarbageCollector {
   void ClearCallback(const std::function<void()> &callback) override;
 };

+#ifdef PADDLE_WITH_XPU
+class XPUGarbageCollector : public GarbageCollector {
+ public:
+  XPUGarbageCollector(const platform::XPUPlace &place, size_t max_memory_size);
+
+ protected:
+  void ClearCallback(const std::function<void()> &callback) override;
+};
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 class UnsafeFastGPUGarbageCollector : public GarbageCollector {
  public:

/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <deque>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <utility>

#include "paddle/fluid/framework/generator.h"

namespace paddle {
namespace framework {

std::shared_ptr<Generator> Generator::gen_instance_ = NULL;

GeneratorState* Generator::GetState() {
  std::lock_guard<std::mutex> lock(this->mutex);
  return this->state_.get();
}

void Generator::SetState(GeneratorState* state_in) {
  std::lock_guard<std::mutex> lock(this->mutex);
  *this->state_ = *state_in;
}

uint64_t Generator::GetCurrentSeed() {
  std::lock_guard<std::mutex> lock(this->mutex);
  return this->state_->current_seed;
}

uint64_t Generator::Seed() {
  std::lock_guard<std::mutex> lock(this->mutex);
  uint64_t seed;
  std::random_device de;
  seed = ((((uint64_t)de()) << 32) + de()) & 0x1FFFFFFFFFFFFF;
  this->state_->current_seed = seed;
  std::seed_seq seq({seed});
  this->state_->cpu_engine.seed(seq);
  return this->state_->current_seed;
}

void Generator::SetCurrentSeed(uint64_t seed) {
  std::lock_guard<std::mutex> lock(this->mutex);
  this->state_->current_seed = uint64_t(seed);
  std::seed_seq seq({seed});
  this->state_->cpu_engine.seed(seq);
}

std::mt19937_64& Generator::GetCPUEngine() {
  std::lock_guard<std::mutex> lock(this->mutex);
  return this->state_->cpu_engine;
}

void Generator::SetCPUEngine(std::mt19937_64 engine) {
  std::lock_guard<std::mutex> lock(this->mutex);
  this->state_->cpu_engine = std::mt19937_64(engine);
}

uint64_t Generator::Random64() {
  std::lock_guard<std::mutex> lock(this->mutex);
  return this->state_->cpu_engine();
}

}  // namespace framework
}  // namespace paddle

/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <stdint.h>
#include <atomic>
#include <deque>
#include <iostream>  // temp for debug
#include <memory>
#include <mutex>  // NOLINT
#include <random>
#include <typeinfo>
#include <utility>

namespace paddle {
namespace framework {

struct GeneratorState {
  int64_t device = -1;
  uint64_t current_seed = 34342423252;
  std::mt19937_64 cpu_engine;
};

struct Generator {
  Generator() {
    GeneratorState default_gen_state_cpu;
    default_gen_state_cpu.device = -1;
    default_gen_state_cpu.current_seed = 34342423252;
    std::seed_seq seq({34342423252});
    default_gen_state_cpu.cpu_engine = std::mt19937_64(seq);
    this->state_ = std::make_shared<GeneratorState>(default_gen_state_cpu);
  }
  explicit Generator(GeneratorState state_in)
      : state_{std::make_shared<GeneratorState>(state_in)} {}
  Generator(const Generator& other)
      : Generator(other, std::lock_guard<std::mutex>(other.mutex)) {}

  // get random state
  GeneratorState* GetState();
  // set random state
  void SetState(GeneratorState* state_in);
  // get current seed
  uint64_t GetCurrentSeed();
  // random a seed and get
  uint64_t Seed();
  // set seed
  void SetCurrentSeed(uint64_t seed);
  // get cpu engine
  std::mt19937_64& GetCPUEngine();
  // set cpu engine
  void SetCPUEngine(std::mt19937_64 engine);

  uint64_t Random64();

  bool is_init_py = false;

  // CPU Generator singleton
  static std::shared_ptr<Generator> GetInstance() {
    if (NULL == gen_instance_) {
      gen_instance_.reset(new paddle::framework::Generator());
    }
    return gen_instance_;
  }

  static std::shared_ptr<Generator> GetInstanceX() {
    if (NULL == gen_instance_) {
      gen_instance_.reset(new paddle::framework::Generator());
    }
    gen_instance_->is_init_py = true;
    return gen_instance_;
  }

 private:
  static std::shared_ptr<Generator> gen_instance_;
  std::shared_ptr<GeneratorState> state_;
  mutable std::mutex mutex;

  Generator(const Generator& other, const std::lock_guard<std::mutex>&)
      : state_(std::make_shared<GeneratorState>(*(other.state_))) {}
};

}  // namespace framework
}  // namespace paddle
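
A minimal usage sketch of the new CPU Generator singleton, using only the members declared above; the call site itself is illustrative and not part of the commit:

```cpp
#include <cstdint>
#include <iostream>

#include "paddle/fluid/framework/generator.h"

int main() {
  auto gen = paddle::framework::Generator::GetInstance();
  gen->SetCurrentSeed(2020);          // reseeds the shared mt19937_64 engine
  std::uint64_t a = gen->Random64();  // thread-safe draw under the mutex
  std::uint64_t b = gen->Random64();
  std::cout << gen->GetCurrentSeed() << " " << a << " " << b << std::endl;
  return 0;
}
```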

@@ -16,9 +16,6 @@
 #include "paddle/fluid/framework/io/crypto/aes_cipher.h"
 #include "paddle/fluid/framework/io/crypto/cipher_utils.h"
 #include "paddle/fluid/platform/enforce.h"
-#ifdef ON_INFER
-#include "paddle/fluid/inference/api/paddle_api.h"
-#endif

 namespace paddle {
 namespace framework {

@@ -59,7 +56,7 @@ std::shared_ptr<Cipher> CipherFactory::CreateCipher(
 }
 }  // namespace framework

-#ifdef ON_INFER
+#ifdef PADDLE_ON_INFERENCE
 std::shared_ptr<framework::Cipher> MakeCipher(const std::string& config_file) {
   return framework::CipherFactory::CreateCipher(config_file);
 }

@@ -368,3 +368,7 @@ REGISTER_PASS(conv_transpose_bn_fuse_pass,
               paddle::framework::ir::ConvTransposeBNFusePass);
 REGISTER_PASS(conv_transpose_eltwiseadd_bn_fuse_pass,
               paddle::framework::ir::ConvTransposeEltwiseAddBNFusePass);
+REGISTER_PASS(depthwise_conv_bn_fuse_pass,
+              paddle::framework::ir::DepthwiseConvBNFusePass);
+REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass,
+              paddle::framework::ir::DepthwiseConvEltwiseAddBNFusePass);

@@ -56,6 +56,16 @@ class ConvTransposeEltwiseAddBNFusePass : public ConvEltwiseAddBNFusePass {
   std::string conv_type() const { return "conv2d_transpose"; }
 };

+class DepthwiseConvBNFusePass : public ConvBNFusePass {
+ public:
+  std::string conv_type() const { return "depthwise_conv2d"; }
+};
+
+class DepthwiseConvEltwiseAddBNFusePass : public ConvEltwiseAddBNFusePass {
+ public:
+  std::string conv_type() const { return "depthwise_conv2d"; }
+};
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle

@@ -68,11 +68,35 @@ static bool HasInput(Node* n, std::string name) {
   return input_names_set.find(name) != input_names_set.end();
 }

+static Node* GetInputVar(Node* n, const std::string& name) {
+  PADDLE_ENFORCE_EQ(n && n->IsOp() && n->Op(), true,
+                    platform::errors::InvalidArgument(
+                        "Expected node %p to be an operator node.", n));
+  for (auto* in : n->inputs) {
+    if (in->Name() == name) {
+      return in;
+    }
+  }
+  return nullptr;
+}
+
+static Node* GetOutputVar(Node* n, const std::string& name) {
+  PADDLE_ENFORCE_EQ(n && n->IsOp() && n->Op(), true,
+                    platform::errors::InvalidArgument(
+                        "Expected node %p to be an operator node.", n));
+  for (auto* out : n->outputs) {
+    if (out->Name() == name) {
+      return out;
+    }
+  }
+  return nullptr;
+}
+
 std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
     SubGraph* subgraph) {
-  std::unordered_map<std::string, int> var_ids = EncodeVarNodes(subgraph);
-  std::vector<Node*> intermediate_out_nodes =
-      subgraph->GetIntermediateOutVarNodes();
+  std::unordered_map<Node*, int> var_ids = EncodeVarNodes(subgraph);
+  std::unordered_set<Node*> intermediate_out_vars_set =
+      subgraph->GetIntermediateOutVarNodesSet();
   std::vector<OperationExpression> expressions;
   for (auto* node : subgraph->SortedNodes()) {
     if (node && node->IsOp() && node->Op()) {

@@ -92,11 +116,12 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
         // "elementwise_add_grad", where "X", "Y" and "Out" are not used.
         if ((HasInput(node, name) && op->Input(name).size() >= 1U)) {
           for (size_t i = 0; i < op->Input(name).size(); i++) {
+            Node* input_var = GetInputVar(node, op->Input(name)[i]);
             PADDLE_ENFORCE_NE(
-                var_ids.find(op->Input(name)[i]), var_ids.end(),
+                var_ids.find(input_var), var_ids.end(),
                 platform::errors::InvalidArgument(
                     "Input(%s) of operation %s is not set.", name, op->Type()));
-            input_ids.push_back(var_ids[op->Input(name)[i]]);
+            input_ids.push_back(var_ids[input_var]);
           }
         } else {
           input_ids.push_back(-1);

@@ -106,31 +131,29 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
       // Output ids should be set in fixed order, like:
       //  - dx, dy in backward operations
       std::vector<int> output_ids;
+      std::vector<int> intermediate_output_ids;
       std::vector<std::string> output_names =
           OperationMap::Instance().Get(op->Type()).output_names;
-      std::unordered_map<int, bool> intermediate_state;
       for (auto& name : output_names) {
+        Node* output_var = GetOutputVar(node, op->Output(name)[0]);
         PADDLE_ENFORCE_NE(
-            var_ids.find(op->Output(name)[0]), var_ids.end(),
+            var_ids.find(output_var), var_ids.end(),
            platform::errors::InvalidArgument(
                 "Output(%s) of operation %s is not set.", name, op->Type()));
-        output_ids.push_back(var_ids[op->Output(name)[0]]);
-        bool enable_intermediate = false;
-        for (auto* n : intermediate_out_nodes) {
-          if (n->Name() == op->Output(name)[0]) {
-            enable_intermediate = true;
-            break;
-          }
-        }
-        intermediate_state[var_ids[op->Output(name)[0]]] = enable_intermediate;
+        output_ids.push_back(var_ids[output_var]);
+        if (!subgraph->SaveIntermediateOut() &&
+            intermediate_out_vars_set.find(output_var) !=
+                intermediate_out_vars_set.end()) {
+          intermediate_output_ids.push_back(var_ids[output_var]);
+        }
       }

       std::string lhs_type = ExtractDataType(node->outputs);
       std::string rhs_type = ExtractDataType(node->inputs);
       auto expression =
           OperationExpression(node->Name(), input_ids, output_ids, rhs_type,
-                              lhs_type, intermediate_state);
+                              lhs_type, intermediate_output_ids);
       expression.SetAttr(attr);
       expressions.push_back(expression);
     }

@@ -146,17 +169,18 @@ std::string CodeGenerator::Generate(
   // TODO(liuyiqun): Check whether all expressions are elementwise operations.
   std::set<int> input_ids = std::move(DistilInputIds(expressions));
   std::set<int> output_ids = std::move(DistilOutputIds(expressions));
-  std::set<int> intermediate_ids =
+  std::set<int> intermediate_output_ids =
       std::move(DistilIntermediateIds(expressions));
   std::unordered_map<int, std::string> dtypes =
       std::move(DistilDtypes(expressions));
   TemplateVariable template_var;
   template_var.Add("func_name", func_name);
-  template_var.Add("parameters", EmitParameters(input_ids, output_ids,
-                                                intermediate_ids, dtypes));
+  template_var.Add(
+      "parameters",
+      EmitParameters(input_ids, output_ids, intermediate_output_ids, dtypes));
   template_var.Add("compute_body",
                    EmitComputeBody(expressions, input_ids, output_ids,
-                                   intermediate_ids, dtypes));
+                                   intermediate_output_ids, dtypes));

   std::set<std::string> all_dtype;
   for (const auto& type : dtypes) {

@@ -204,18 +228,14 @@ std::set<int> CodeGenerator::DistilOutputIds(
 std::set<int> CodeGenerator::DistilIntermediateIds(
     const std::vector<OperationExpression>& expressions) {
-  std::set<int> intermediate_ids;
+  std::set<int> intermediate_output_ids;
   // Use std::set to remove the repeated ids and get an ordered list.
   for (size_t i = 0; i < expressions.size(); i++) {
-    for (auto id : expressions[i].GetOutputIds()) {
-      auto intermediate_state = expressions[i].GetIntermediateState();
-      if (intermediate_state.find(id) != intermediate_state.end() &&
-          intermediate_state[id]) {
-        intermediate_ids.insert(id);
-      }
+    for (auto id : expressions[i].GetIntermediateOutputIds()) {
+      intermediate_output_ids.insert(id);
     }
   }
-  return intermediate_ids;
+  return intermediate_output_ids;
 }

 std::unordered_map<int, std::string> CodeGenerator::DistilDtypes(

@@ -316,26 +336,29 @@ std::string CodeGenerator::EmitComputeBody(
   return load.str() + compute.str() + store.str();
 }

-std::unordered_map<std::string, int> CodeGenerator::EncodeVarNodes(
+std::unordered_map<Node*, int> CodeGenerator::EncodeVarNodes(
     SubGraph* subgraph) {
   const auto& input_var_nodes = subgraph->GetInputVarNodes();
-  const auto& output_var_nodes = subgraph->GetOutputVarNodes();
+  // Encode all var nodes, including intermediate output var nodes.
+  const auto& output_var_nodes = subgraph->GetOutputVarNodes(true);

   int id = 0;
-  std::unordered_map<std::string, int> var_ids;
+  std::unordered_map<Node*, int> var_ids;
   // Numbering input vars.
   for (auto* in : input_var_nodes) {
-    VLOG(3) << "Encoding input names:" << in->Name() << ", id:" << id;
-    if (var_ids.find(in->Name()) == var_ids.end()) {
-      var_ids[in->Name()] = id++;
+    VLOG(3) << "Encoding input names:" << in->Name() << "(" << in
+            << "), id:" << id;
+    if (var_ids.find(in) == var_ids.end()) {
+      var_ids[in] = id++;
     }
   }

   // Encoding output vars.
   for (auto* out : output_var_nodes) {
-    VLOG(3) << "Ecoding output names:" << out->Name() << ", id:" << id;
-    if (var_ids.find(out->Name()) == var_ids.end()) {
-      var_ids[out->Name()] = id++;
+    VLOG(3) << "Ecoding output names:" << out->Name() << "(" << out
+            << "), id:" << id;
+    if (var_ids.find(out) == var_ids.end()) {
+      var_ids[out] = id++;
     }
   }
   return var_ids;

@@ -61,7 +61,7 @@ class CodeGenerator {
       const std::unordered_map<int, std::string>& dtypes) const;

   // Encode all var nodes in the subgraph with a unique number.
-  std::unordered_map<std::string, int> EncodeVarNodes(SubGraph* subgraph);
+  std::unordered_map<Node*, int> EncodeVarNodes(SubGraph* subgraph);

  private:
  std::vector<CodeTemplate> code_templates_;

@@ -48,20 +48,20 @@ class OperationExpression {
       std::string op_type, const std::vector<int>& input_ids,
       const std::vector<int>& output_ids, std::string rhs_type,
       std::string lhs_type,
-      const std::unordered_map<int, bool>& intermediate_state = {})
+      const std::vector<int>& intermediate_output_ids = {})
       : op_type_(op_type),
         input_ids_(input_ids),
         output_ids_(output_ids),
         rhs_type_(rhs_type),
         lhs_type_(lhs_type),
-        intermediate_state_(intermediate_state) {}
+        intermediate_output_ids_(intermediate_output_ids) {}

   std::string GetOpType() const { return op_type_; }
-  std::unordered_map<int, bool> GetIntermediateState() const {
-    return intermediate_state_;
-  }
   std::vector<int> GetInputIds() const { return input_ids_; }
   std::vector<int> GetOutputIds() const { return output_ids_; }
+  std::vector<int> GetIntermediateOutputIds() const {
+    return intermediate_output_ids_;
+  }
   std::string GetRHSType() const { return rhs_type_; }
   std::string GetLHSType() const { return lhs_type_; }
   void SetAttr(AttributeMap attr) { attr_ = attr; }
@@ -84,7 +84,7 @@ class OperationExpression {
   AttributeMap attr_;
   std::string rhs_type_;
   std::string lhs_type_;
-  std::unordered_map<int, bool> intermediate_state_;
+  std::vector<int> intermediate_output_ids_;
 };

 class TemplateVariable {
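For reference, a hypothetical construction of the revised OperationExpression, assuming the surrounding fusion_group namespace; the ids and types are made up. An output id that also appears in intermediate_output_ids marks a value consumed only inside the fused kernel.

// Sketch: output id 3 is produced by this expression and is also an
// intermediate output, i.e. it need not be stored back to global memory.
fusion_group::OperationExpression expr(
    /*op_type=*/"relu", /*input_ids=*/{1}, /*output_ids=*/{3},
    /*rhs_type=*/"float", /*lhs_type=*/"float",
    /*intermediate_output_ids=*/{3});
std::vector<int> ids = expr.GetIntermediateOutputIds();  // == {3}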
......
@@ -144,7 +144,6 @@ void CheckOutput(const std::vector<OperationExpression>& expressions,
       LOG(INFO) << "Precision check failed from i = " << id
                 << ", expect: " << expect << ", actual: " << actual;
       EXPECT_LT(fabs(actual - expect), eps);
-      break;
     }
   }
 }
@@ -465,7 +464,7 @@ TEST(code_generator, subgraph) {
   for (std::string dtype : {"float", "__half"}) {
     std::unique_ptr<paddle::framework::ir::Graph> graph =
         BuildGraph(false, dtype);
-    fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", false,
+    fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", true,
                                     graph->Nodes());

     // Expressions generated by code_generator (they may be different):
@@ -484,7 +483,7 @@ TEST(code_generator, subgraph_grad) {
   for (std::string dtype : {"float", "__half"}) {
     std::unique_ptr<paddle::framework::ir::Graph> graph =
         BuildGraph(true, dtype);
-    fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", false,
+    fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", true,
                                     DistilGradNodes(graph));

     // Expressions generated by code_generator (they may be different):
......
@@ -63,7 +63,7 @@ static bool IsEqualAndNotEmpty(const std::vector<int64_t>& l,
 bool GroupDetector::CheckPrecondition(const Node* n) {
   auto check_data_type = [&](const std::vector<Node*>& nodes) -> bool {
     bool is_first = true;
-    proto::VarType::Type data_type_0;
+    proto::VarType::Type data_type_0 = proto::VarType::BOOL;
     for (auto* n : nodes) {
       if (n && n->IsVar() && n->Var()) {
         if (n->Var()->GetType() != proto::VarType::LOD_TENSOR) {
......
@@ -63,11 +63,6 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
         std::unordered_set<Node*>(vec.begin(), vec.end()));
     VLOG(3) << "subgraph: {\n" << DebugString(subgraph.SortedNodes()) << "}\n";

-    // In elementwise fused kernel, memory is the bound of execution,
-    // here we remove the output id to use less memory and less time.
-    if (subgraph.RemoveIntermediateOut()) {
-      subgraph.DetectIntermediateOutWithGraph(graph);
-    }
     if (subgraph.IsValid(min_subgraph_size)) {
       subgraph.SetFuncName("fused_elementwise_" + std::to_string(index++));
       if (GenerateCode(&subgraph)) {
@@ -115,57 +110,52 @@ static int ExtractOpRole(fusion_group::SubGraph* subgraph) {
 void FusionGroupPass::InsertFusionGroupOp(
     Graph* graph, fusion_group::SubGraph* subgraph) const {
-  const std::vector<Node*>& input_vars_of_subgraph =
-      subgraph->GetInputVarNodes();
-  const std::vector<Node*>& output_vars_of_subgraph =
-      subgraph->GetOutputVarNodes();
-  const std::vector<Node*> intermediate_vars_of_subgraph =
-      subgraph->GetIntermediateOutVarNodes();
+  const std::vector<Node*>& input_vars = subgraph->GetInputVarNodes();
+  const std::vector<Node*>& output_vars =
+      subgraph->GetOutputVarNodes(subgraph->SaveIntermediateOut());
   std::unordered_set<Node*> external_nodes;

-  OpDesc op_desc;
-  op_desc.SetType("fusion_group");
+  // Prepare inputs.
   std::vector<std::string> input_names;
-  std::vector<std::string> inputs_data_types;
-  for (auto* n : input_vars_of_subgraph) {
-    input_names.push_back(n->Name());
-    inputs_data_types.push_back(DataTypeToString(n->Var()->GetDataType()));
-    external_nodes.insert(n);
+  std::vector<int> input_dtypes;
+  std::unordered_set<Node*> output_vars_set(output_vars.begin(),
+                                            output_vars.end());
+  for (auto* n : input_vars) {
+    // It is not an output var node.
+    if (output_vars_set.find(n) == output_vars_set.end()) {
+      input_names.push_back(n->Name());
+      input_dtypes.push_back(n->Var()->GetDataType());
+      external_nodes.insert(n);
+    }
   }
-  op_desc.SetInput("Inputs", input_names);

+  // Prepare outputs.
   std::vector<std::string> output_names;
-  std::vector<std::string> outs_data_types;
-  std::vector<Node*> output_var_without_intermediate;
-  for (auto* n : output_vars_of_subgraph) {
-    auto it_input =
-        find(input_vars_of_subgraph.begin(), input_vars_of_subgraph.end(), n);
-    auto it_intermediate = find(intermediate_vars_of_subgraph.begin(),
-                                intermediate_vars_of_subgraph.end(), n);
-    if (it_intermediate == intermediate_vars_of_subgraph.end() &&
-        it_input == input_vars_of_subgraph.end()) {
-      output_names.push_back(n->Name());
-      outs_data_types.push_back(DataTypeToString(n->Var()->GetDataType()));
-      output_var_without_intermediate.push_back(n);
-    }
+  std::vector<int> output_dtypes;
+  for (auto* n : output_vars) {
+    output_names.push_back(n->Name());
+    output_dtypes.push_back(n->Var()->GetDataType());
     external_nodes.insert(n);
   }

+  OpDesc op_desc;
+  op_desc.SetType("fusion_group");
+  op_desc.SetInput("Inputs", input_names);
   op_desc.SetOutput("Outs", output_names);
-  op_desc.SetAttr("inputs_data_type", inputs_data_types);
-  op_desc.SetAttr("outs_data_type", outs_data_types);
+  op_desc.SetAttr("inputs_dtype", input_dtypes);
+  op_desc.SetAttr("outs_dtype", output_dtypes);
   op_desc.SetAttr("type", subgraph->GetType());
   op_desc.SetAttr("func_name", subgraph->GetFuncName());
   op_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
                   ExtractOpRole(subgraph));

   Node* fusion_group_node = graph->CreateOpNode(&op_desc);
-  for (auto* in : input_vars_of_subgraph) {
-    IR_NODE_LINK_TO(in, fusion_group_node);
+  for (auto* in : input_vars) {
+    if (output_vars_set.find(in) == output_vars_set.end()) {
+      IR_NODE_LINK_TO(in, fusion_group_node);
+    }
   }
-
-  for (auto* out : output_var_without_intermediate) {
+  for (auto* out : output_vars) {
     IR_NODE_LINK_TO(fusion_group_node, out);
   }
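The rewiring above hinges on one rule: a var that the subgraph both consumes and produces is wired only as an output of the fusion_group op. A self-contained sketch of that rule, with opaque ints standing in for Node pointers (names are illustrative):

#include <unordered_set>
#include <vector>

// Keep only the inputs that are not simultaneously outputs, mirroring the
// output_vars_set filtering in InsertFusionGroupOp above.
std::vector<int> PureInputs(const std::vector<int>& inputs,
                            const std::vector<int>& outputs) {
  std::unordered_set<int> output_set(outputs.begin(), outputs.end());
  std::vector<int> pure;
  for (int id : inputs) {
    if (output_set.find(id) == output_set.end()) pure.push_back(id);
  }
  return pure;
}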
......
@@ -105,12 +105,6 @@ void OperationMap::InsertUnaryElementwiseOperations() {
   insert_handler("tanh", "%{2.0} / (%{1.0} + Exp(-%{2.0} * ${0})) - %{1.0}",
                  {"${2} * (%{1.0} - ${1} * ${1})"});
-  // cast:
-  //  out = static_cast<T>(x)
-  // TODO(wangchaochaohu): This is not the compelete definition of
-  // cast Op, We need refine it later.
-  insert_handler("cast", "${0}", {});
   // sqrt:
   //  out = x^(1/2)
   //  dx = dout * 0.5 / out
@@ -121,11 +115,21 @@ void OperationMap::InsertUnaryElementwiseOperations() {
   //  dx = dout * 2.0 * x
   insert_handler("square", "${0} * ${0}", {"${2} * %{2.0} * ${0}"});

+  // assign:
+  //  out = x
+  insert_handler("assign", "${0}", {});
+  // cast:
+  //  out = static_cast<T>(x)
+  // TODO(wangchaochaohu): This is not the complete definition of
+  // cast Op; we need to refine it later.
+  insert_handler("cast", "${0}", {});
+
   // scale
   //  out = (bias_after_scale) ? scale * X + bias : scale(X + bias)
   // here we use the '=' operator to separate the default value
   // TODO(wangchaochaohu): Later we need to support Tensor input for scale and
   // bias.
   insert_handler(
       "scale",
       "${bias_after_scale=true} ? (${scale=%{1.0}} * ${0} + "
......
@@ -66,11 +66,12 @@ class SubGraph {
   }

   int GetType() const { return type_; }
-  bool RemoveIntermediateOut() { return !save_intermediate_out_; }
   void SetFuncName(std::string func_name) { func_name_ = func_name; }
   std::string GetFuncName() const { return func_name_; }
+  bool SaveIntermediateOut() const { return save_intermediate_out_; }

   const std::unordered_set<Node*>& Nodes() const { return nodes_set_; }
   const std::vector<Node*>& SortedNodes() {
     if (!is_sorted_) {
@@ -118,66 +119,88 @@ class SubGraph {
     return input_vars;
   }

-  std::vector<Node*> GetOutputVarNodes() {
+  std::vector<Node*> GetOutputVarNodes(bool with_intermediate_out) {
     // The order of output nodes should be consistent anywhere.
-    std::vector<Node*> output_vars_all;
+    std::vector<Node*> output_vars;
     for (auto* n : SortedNodes()) {
-      if (n && n->IsVar() && n->Var()) {
+      if (IsOutputOfInternalOp(n)) {
         // If the var_node is the output of some op_node in the subgraph, it
         // is considered the output var node of the subgraph.
-        bool is_found = false;
-        for (auto* in : n->inputs) {
-          if (Has(in)) {
-            is_found = true;
+        if (with_intermediate_out) {
+          output_vars.push_back(n);
+        } else {
+          if (n->outputs.empty() || IsInputOfExternalOp(n)) {
+            output_vars.push_back(n);
           }
         }
-        if (is_found) {
-          output_vars_all.push_back(n);
-        }
       }
     }
-    return output_vars_all;
+    return output_vars;
   }

   std::vector<Node*> GetIntermediateOutVarNodes() {
-    return intermediate_out_nodes_;
+    // Intermediate output var nodes: the output of some op_node in the
+    // subgraph, but not referenced outside the subgraph.
+    std::vector<Node*> intermediate_out_vars;
+    for (auto* n : SortedNodes()) {
+      if (IsOutputOfInternalOp(n) && IsInputOfInternalOp(n) &&
+          !IsInputOfExternalOp(n)) {
+        // When the outputs size is 0, it is also considered an intermediate
+        // output. It may be an unused output or a fetched var, so we
+        // cannot eliminate it directly here.
+        intermediate_out_vars.push_back(n);
+      }
+    }
+    return intermediate_out_vars;
   }

-  void DetectIntermediateOutWithGraph(Graph* graph) {
-    auto graph_nodes = graph->Nodes();
-
-    for (auto* n : SortedNodes()) {
-      bool enable_remove = true;
-      if (n && n->IsVar() && n->Var()) {
-        bool leaf_graph = true;
-        for (auto* node : graph_nodes) {
-          if (node->IsOp()) {
-            auto inputs = node->inputs;
-            for (auto* in : inputs) {
-              if (in && in->Name() == n->Name()) {
-                if (!Has(node)) enable_remove = false;
-                leaf_graph = false;
-              }
-            }
-          }
-          if (!enable_remove) {
-            break;
-          }
-        }
-        if (leaf_graph) enable_remove = false;
-      } else {
-        enable_remove = false;
-      }
-
-      if (enable_remove) {
-        intermediate_out_nodes_.push_back(n);
-      }
-    }
-  }
+  std::unordered_set<Node*> GetIntermediateOutVarNodesSet() {
+    std::vector<Node*> intermediate_out_vars = GetIntermediateOutVarNodes();
+    return std::unordered_set<Node*>(intermediate_out_vars.begin(),
+                                     intermediate_out_vars.end());
+  }

+ private:
+  bool IsInputOfInternalOp(Node* n) {
+    bool is_input_of_internal_op = false;
+    if (Has(n) && n && n->IsVar() && n->Var()) {
+      for (auto* out : n->outputs) {
+        if (Has(out)) {
+          is_input_of_internal_op = true;
+          break;
+        }
+      }
+    }
+    return is_input_of_internal_op;
+  }
+
+  bool IsInputOfExternalOp(Node* n) {
+    // Whether n is the input of any node outside the subgraph.
+    bool is_input_of_external_op = false;
+    if (Has(n) && n && n->IsVar() && n->Var()) {
+      for (auto* out : n->outputs) {
+        if (!Has(out)) {
+          is_input_of_external_op = true;
+          break;
+        }
+      }
+    }
+    return is_input_of_external_op;
+  }
+
+  bool IsOutputOfInternalOp(Node* n) {
+    bool is_output_of_internal_op = false;
+    if (Has(n) && n && n->IsVar() && n->Var()) {
+      for (auto* in : n->inputs) {
+        if (Has(in)) {
+          is_output_of_internal_op = true;
+          break;
+        }
+      }
+    }
+    return is_output_of_internal_op;
+  }

- private:
   void TopologicalSort() {
     if (!is_sorted_) {
       std::unordered_map<Node*, std::vector<Node*>> inputs_map;
@@ -236,7 +259,6 @@ class SubGraph {
   bool save_intermediate_out_{true};
   std::unordered_set<Node*> nodes_set_;
-  std::vector<Node*> intermediate_out_nodes_{};
   bool is_sorted_{false};
   std::vector<Node*> sorted_nodes_;
 };
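Taken together, the three predicates induce a simple classification of a subgraph's var nodes. The sketch below condenses it into one function; the enum and parameter names are mine, not Paddle's, and "internal" means "inside the subgraph".

enum class VarRole { kInput, kOutput, kIntermediateOutput };

// produced_internally ~ IsOutputOfInternalOp(n)
// consumed_internally ~ IsInputOfInternalOp(n)
// consumed_externally ~ IsInputOfExternalOp(n)
VarRole Classify(bool produced_internally, bool consumed_internally,
                 bool consumed_externally) {
  if (!produced_internally) return VarRole::kInput;
  if (consumed_internally && !consumed_externally)
    return VarRole::kIntermediateOutput;
  return VarRole::kOutput;  // includes produced vars with no consumers at all
}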
......
@@ -1879,6 +1879,19 @@ PDNode *patterns::MultipleQuantize::operator()() {
   return prev_out;
 }

+PDNode *patterns::QuantizePlacement::operator()(
+    const std::unordered_set<std::string> &quantize_enabled_op_types) {
+  std::unordered_set<std::string> supported_op_types =
+      std::unordered_set<std::string>({"concat", "conv2d", "elementwise_add",
+                                       "fc", "matmul", "pool2d", "prior_box",
+                                       "relu", "reshape2", "transpose2"});
+  if (!quantize_enabled_op_types.empty()) {
+    supported_op_types = quantize_enabled_op_types;
+  }
+  auto *op = pattern->NewNode(op_repr())->assert_is_ops(supported_op_types);
+  return op;
+}
+
 PDNode *patterns::MKLDNNInPlace::operator()() {
   const std::unordered_set<std::string> &supported_op_types = {
       "abs",
......
@@ -1120,6 +1120,15 @@ struct MultipleQuantize : public PatternBase {
   PATTERN_DECL_NODE(prev_out);
 };

+struct QuantizePlacement : public PatternBase {
+  QuantizePlacement(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "quantize_placement") {}
+  PDNode* operator()(
+      const std::unordered_set<std::string>& quantize_enabled_op_types);
+
+  PATTERN_DECL_NODE(op);
+};
+
 // Pattern used for enforcing in-place computation for DNNL ops that
 // support it: softmax, batch_norm and layer_norm.
 struct MKLDNNInPlace : public PatternBase {
......
@@ -26,30 +26,33 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
       Get<std::unordered_set<int>>("quantize_excluded_op_ids");
   const auto& op_types_list =
       Get<std::unordered_set<std::string>>("quantize_enabled_op_types");
-  for (const Node* n : graph->Nodes()) {
-    if (n->IsOp()) {
-      if (std::find(excluded_ids_list.begin(), excluded_ids_list.end(),
-                    n->id()) != excluded_ids_list.end())
-        continue;
-      auto* op = n->Op();
-      if (op->HasAttr("mkldnn_data_type") ||
-          op->HasProtoAttr("mkldnn_data_type")) {
-        // use_quantizer is no longer used
-        // assign value for compatibility
-        if (op->GetAttrIfExists<bool>("use_quantizer")) {
-          op->SetAttr("mkldnn_data_type", std::string("int8"));
-        }
-        if (op_types_list.empty()) {
-          op->SetAttr("mkldnn_data_type", std::string("int8"));
-          op->SetAttr("use_quantizer", true);
-        } else if (std::find(op_types_list.begin(), op_types_list.end(),
-                             op->Type()) != op_types_list.end()) {
-          op->SetAttr("mkldnn_data_type", std::string("int8"));
-          op->SetAttr("use_quantizer", true);
-        }
-      }
-    }
-  }
+  Init(name_scope_, graph);
+  GraphPatternDetector gpd;
+  patterns::QuantizePlacement quantize_placement_pattern{gpd.mutable_pattern(),
+                                                         "quantize_placement"};
+  quantize_placement_pattern(op_types_list);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(op, op, quantize_placement_pattern);
+
+    if (std::find(excluded_ids_list.begin(), excluded_ids_list.end(),
+                  op->id()) != excluded_ids_list.end()) {
+      return;
+    }
+
+    if (op->Op()->HasAttr("mkldnn_data_type") ||
+        op->Op()->HasProtoAttr("mkldnn_data_type")) {
+      // use_quantizer is no longer used
+      // assign value for compatibility
+      if (op->Op()->GetAttrIfExists<bool>("use_quantizer")) {
+        op->Op()->SetAttr("mkldnn_data_type", std::string("int8"));
+      }
+      op->Op()->SetAttr("mkldnn_data_type", std::string("int8"));
+      op->Op()->SetAttr("use_quantizer", true);
+    }
+  };
+  gpd(graph, handler);
 }

 }  // namespace ir
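The rewritten pass follows the usual detector/handler shape of GraphPatternDetector-based passes. A stripped-down sketch of that shape, where patterns::SomePattern is a hypothetical pattern declaring PATTERN_DECL_NODE(op):

GraphPatternDetector gpd;
patterns::SomePattern pattern{gpd.mutable_pattern(), "scope"};
pattern(/* pattern-specific arguments build the PDNode graph */);
auto handler = [&](const GraphPatternDetector::subgraph_t& matched, Graph* g) {
  GET_IR_NODE_FROM_SUBGRAPH(op, op, pattern);  // bind the matched node
  // ... inspect or rewrite `op` here ...
};
gpd(graph, handler);  // walks the graph, invoking handler once per match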
......
@@ -15,7 +15,10 @@ limitations under the License. */
 #pragma once

 #include <memory>
-#include "paddle/fluid/framework/ir/pass.h"
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

 namespace paddle {
 namespace framework {
@@ -23,9 +26,10 @@ namespace ir {
 /*
  * Specifies which operators should be quantized.
  */
-class CPUQuantizePlacementPass : public Pass {
+class CPUQuantizePlacementPass : public FusePassBase {
  protected:
   void ApplyImpl(ir::Graph* graph) const override;

+  const std::string name_scope_{"cpu_quantize_placement_pass"};
 };

 }  // namespace ir
......
@@ -130,7 +130,7 @@ TEST(QuantizerPlacementPass, enabled_conv_excluded_one) {
   MainTest({"conv2d"}, {4}, 1);
 }

-TEST(QuantizerPlacementPass, excluded_none) {
+TEST(QuantizerPlacementPass, empty_list) {
   // all operators quantized
   MainTest({}, {}, 6);
 }
......
@@ -81,7 +81,8 @@ void DeleteQuant(ir::Graph* graph, Scope* scope,
       if (quantized_op_type == "conv2d" ||
           quantized_op_type == "conv2d_fusion" ||
           quantized_op_type == "depthwise_conv2d" ||
-          quantized_op_type == "fc") {
+          quantized_op_type == "fc" ||
+          quantized_op_type == "conv2d_transpose") {
         op_desc->SetAttr("Input_scale", scale_value);
       } else if (quantized_op_type == "mul") {
         op_desc->SetAttr("X_scale", scale_value);
@@ -111,7 +112,8 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
   std::string input_name = "";
   if (quantized_op_type == "conv2d" ||
       quantized_op_type == "depthwise_conv2d" ||
-      quantized_op_type == "conv2d_fusion") {
+      quantized_op_type == "conv2d_fusion" ||
+      quantized_op_type == "conv2d_transpose") {
     weight_name = "Filter";
     input_name = "Input";
   } else if (quantized_op_type == "mul") {
@@ -122,7 +124,8 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
     input_name = "Input";
   } else {
     PADDLE_THROW(platform::errors::Unimplemented(
-        "QuantDequantFuse: We only support conv2d, conv2d_fusion, fc, mul for "
+        "QuantDequantFuse: We only support conv2d, conv2d_fusion, "
+        "conv2d_transpose, fc, mul for "
        "now."));
   }
   const std::string pattern_name = "dequant_fuse";
@@ -192,10 +195,12 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
         scope->Var(quantized_op_weight_node->Name())->GetMutable<LoDTensor>();
     auto w_dims = weight_tensor->dims();
     // If quantized op is fc, weight scale size = 1;
-    // If quantized op is conv, weight scale size = weight dims[0]
+    // If quantized op is conv2d, weight scale size = weight dims[0];
+    // If quantized op is conv2d_transpose, weight scale size = weight dims[1].
     bool valid_scale_size =
         (weight_scale.size() == 1 ||
-         weight_scale.size() == static_cast<size_t>(w_dims[0]));
+         weight_scale.size() == static_cast<size_t>(w_dims[0]) ||
+         weight_scale.size() == static_cast<size_t>(w_dims[1]));
     PADDLE_ENFORCE_EQ(
         valid_scale_size, true,
         platform::errors::InvalidArgument(
@@ -206,8 +211,14 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
       if (weight_scale.size() == 1) {
         quantized_weight_data[j] *= weight_scale[0];
       } else {
-        int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
-        quantized_weight_data[j] *= weight_scale[j / inner_size];
+        if (quantized_op_type == "conv2d_transpose") {
+          int inner_size = w_dims[2] * w_dims[3];
+          quantized_weight_data[j] *=
+              weight_scale[(j / inner_size) % w_dims[1]];
+        } else {
+          int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
+          quantized_weight_data[j] *= weight_scale[j / inner_size];
+        }
       }
     }
@@ -220,7 +231,8 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
   new_op_desc.SetType(quantized_op_type);
   new_op_desc.SetAttr("enable_int8", true);
   if (quantized_op_type == "conv2d" || quantized_op_type == "conv2d_fusion" ||
-      quantized_op_type == "depthwise_conv2d") {
+      quantized_op_type == "depthwise_conv2d" ||
+      quantized_op_type == "conv2d_transpose") {
     new_op_desc.SetInput("Input", {new_input});
     new_op_desc.SetOutput("Output", {new_output});
   } else if (quantized_op_type == "fc") {
@@ -253,7 +265,7 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
   std::unordered_set<std::string> quant_types = {
       "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
   std::unordered_set<std::string> quantized_op_types = {
-      "conv2d", "mul", "depthwise_conv2d", "fc"};
+      "conv2d", "mul", "depthwise_conv2d", "fc", "conv2d_transpose"};
   auto* scope = param_scope();
   for (auto& quant_type : quant_types) {
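A worked example of the conv2d_transpose indexing added above. The filter layout is [in_channels, out_channels, kh, kw] and the scales are per output channel, so for a flat weight index j the scale index is (j / (kh * kw)) % out_channels. The sizes below are made up.

#include <cstdio>

int main() {
  const int out_c = 3, kh = 2, kw = 2;          // layout [in_c, out_c, kh, kw]
  const int inner_size = kh * kw;               // 4 weights per (in, out) pair
  const int j = 17;                             // some flat weight index
  const int channel = (j / inner_size) % out_c; // (17 / 4) % 3 == 1
  std::printf("weight %d is scaled by weight_scale[%d]\n", j, channel);
  return 0;
}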
......
@@ -309,7 +309,8 @@ std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubGraphs() {
     BriefNode *brief_node = itr.second;

     if (!Agent(brief_node->node).marked()) {
-      VLOG(4) << brief_node->node->id() << " node not a trt candidate.";
+      VLOG(4) << brief_node->node->id() << " node named "
+              << brief_node->node->Name() << " is not a trt candidate.";
       continue;
     }
......
@@ -59,6 +59,8 @@ inline LibraryType StringToLibraryType(const char* ctype) {
     // CPU, CUDA, PLAIN are the same library type.
   } else if (s == std::string("CPU")) {
     return LibraryType::kPlain;
+  } else if (s == std::string("XPU")) {
+    return LibraryType::kPlain;
   } else if (s == std::string("CUDA")) {
     return LibraryType::kPlain;
   } else {
......
@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/op_desc.h"

 #include <algorithm>
 #include <functional>
 #include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <utility>

 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_call_stack.h"
@@ -51,23 +53,62 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   std::vector<std::string> Outputs(const std::string &name) const override;

+  std::string GetInputNameByIdx(size_t idx) const override {
+    auto &op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
+    PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(),
+                      platform::errors::OutOfRange(
+                          "The index should be less than the size of inputs of "
+                          "operator %s, but got index is %d and size is %d",
+                          op_.Type(), idx, op_proto->inputs().size()));
+    return op_proto->inputs()[idx].name();
+  }
+
+  std::string GetOutputNameByIdx(size_t idx) const override {
+    auto &op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
+    PADDLE_ENFORCE_LT(
+        idx, op_proto->outputs().size(),
+        platform::errors::OutOfRange(
+            "The index should be less than the size of outputs of "
+            "operator %s, but got index is %d and size is %d",
+            op_.Type(), idx, op_proto->outputs().size()));
+    return op_proto->outputs()[idx].name();
+  }
+
   void ShareDim(const std::string &in, const std::string &out, size_t i = 0,
                 size_t j = 0) override {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    PADDLE_ENFORCE_LT(i, Inputs(in).size(),
+                      platform::errors::InvalidArgument(
+                          "The input variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Inputs(in).size(), i));
+    PADDLE_ENFORCE_LT(j, Outputs(out).size(),
+                      platform::errors::InvalidArgument(
+                          "The output variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Outputs(out).size(), j));
     std::string input_n = Inputs(in)[i];
     std::string output_n = Outputs(out)[j];

-    PADDLE_ENFORCE(input_n != framework::kEmptyVarName, "The %s[%d] is @EMPTY@",
-                   in, i);
-    PADDLE_ENFORCE(output_n != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", out, j);
+    PADDLE_ENFORCE_NE(input_n, framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The input variable %s[%d] is empty.", in, i));
+    PADDLE_ENFORCE_NE(output_n, framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The output variable %s[%d] is empty.", out, j));

     auto *in_var = block_.FindVarRecursive(input_n);
     auto *out_var = block_.FindVarRecursive(output_n);

-    PADDLE_ENFORCE(in_var->GetType() == out_var->GetType(),
-                   "The type of %s and %s is not the same.", input_n, output_n);
+    PADDLE_ENFORCE_EQ(
+        in_var->GetType(), out_var->GetType(),
+        platform::errors::InvalidArgument(
+            "The type of input %s and output %s do not match. The input type "
+            "is %s, output type is %s.",
+            input_n, output_n, DataTypeToString(in_var->GetType()),
+            DataTypeToString(out_var->GetType())));

     SetDim(output_n, GetDim(input_n));
   }
@@ -101,12 +142,22 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   void ShareLoD(const std::string &in, const std::string &out, size_t i = 0,
                 size_t j = 0) const override {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    PADDLE_ENFORCE(Inputs(in)[i] != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", in, i);
-    PADDLE_ENFORCE(Outputs(out)[j] != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", out, j);
+    PADDLE_ENFORCE_LT(i, Inputs(in).size(),
+                      platform::errors::InvalidArgument(
+                          "The input variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Inputs(in).size(), i));
+    PADDLE_ENFORCE_LT(j, Outputs(out).size(),
+                      platform::errors::InvalidArgument(
+                          "The output variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Outputs(out).size(), j));
+    PADDLE_ENFORCE_NE(Inputs(in)[i], framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The input variable %s[%d] is empty.", in, i));
+    PADDLE_ENFORCE_NE(Outputs(out)[j], framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The output variable %s[%d] is empty.", out, j));
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     if (in_var->GetType() != proto::VarType::LOD_TENSOR &&
@@ -119,30 +170,38 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   int32_t GetLoDLevel(const std::string &in, size_t i = 0) const override {
     PADDLE_ENFORCE_LT(i, Inputs(in).size(),
-                      "Input %s of operator %s only has %d elements.", in,
-                      op_.Type(), Inputs(in).size());
+                      platform::errors::InvalidArgument(
+                          "The input variable index is out of range, input "
+                          "variable %s of operator %s only has %d elements.",
+                          in, op_.Type(), Inputs(in).size()));
     PADDLE_ENFORCE_NE(Inputs(in)[i], framework::kEmptyVarName,
-                      "Input %s[%d] of operator %s is @EMPTY@", in, op_.Type(),
-                      i);
+                      platform::errors::InvalidArgument(
+                          "The input variable %s[%d] of operator %s is empty.",
+                          in, i, op_.Type()));
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     PADDLE_ENFORCE_NOT_NULL(
-        in_var, "Input %s[%d] of operator %s should not be nullptr.", in,
-        op_.Type(), i);
+        in_var, platform::errors::NotFound(
+                    "The input variable %s[%d] of operator %s is not found.",
+                    in, i, op_.Type()));
     return in_var->GetLoDLevel();
   }

   void SetLoDLevel(const std::string &out, int32_t lod_level,
                    size_t j = 0) const override {
     PADDLE_ENFORCE_LT(j, Outputs(out).size(),
-                      "Output %s of operator %s only has %d elements.", out,
-                      op_.Type(), Outputs(out).size());
+                      platform::errors::InvalidArgument(
+                          "The output variable index is out of range, output "
+                          "variable %s of operator %s only has %d elements.",
+                          out, op_.Type(), Outputs(out).size()));
     PADDLE_ENFORCE_NE(Outputs(out)[j], framework::kEmptyVarName,
-                      "Output %s[%d] of operator %s is @EMPTY@", out,
-                      op_.Type(), j);
+                      platform::errors::InvalidArgument(
+                          "The output variable %s[%d] of operator %s is empty.",
+                          out, j, op_.Type()));
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     PADDLE_ENFORCE_NOT_NULL(
-        out_var, "Output %s[%d] of operator %s should not be nullptr.", out,
-        op_.Type(), j);
+        out_var, platform::errors::NotFound(
+                     "The output variable %s[%d] of operator %s is not found.",
+                     out, j, op_.Type()));
     if (lod_level >= 0) {
       out_var->SetLoDLevel(lod_level);
     }
@@ -175,8 +234,10 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   DDim GetInputDim(const std::string &name) const override {
     const std::vector<std::string> &arg_names = Inputs(name);
     PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
-                      "Input(%s) should hold one element, but now it holds %d",
-                      name, arg_names.size());
+                      platform::errors::InvalidArgument(
+                          "The input(%s) should hold only one element, but now "
+                          "it holds %d elements.",
+                          name, arg_names.size()));
     return this->GetDim(arg_names[0]);
   }
@@ -200,8 +261,10 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   void SetOutputDim(const std::string &name, const DDim &dim) override {
     auto arg_names = Outputs(name);
     PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
-                      "Output(%s) should hold one element, but now it holds %d",
-                      name, arg_names.size());
+                      platform::errors::InvalidArgument(
+                          "The output(%s) should hold only one element, but "
+                          "now it holds %d elements.",
+                          name, arg_names.size()));
     SetDim(arg_names[0], dim);
   }
@@ -227,7 +290,8 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   DDim GetDim(const std::string &name) const {
     auto var = block_.FindVarRecursive(name);
-    PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found.", name));
     DDim res;
     try {
       auto shape = var->GetShape();
@@ -253,7 +317,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   void SetDims(const std::vector<std::string> &names,
                const std::vector<DDim> &dims) {
     size_t length = names.size();
-    PADDLE_ENFORCE_EQ(length, dims.size());
+    PADDLE_ENFORCE_EQ(length, dims.size(),
+                      platform::errors::InvalidArgument(
+                          "The input variables number(%d) and input dimensions "
+                          "number(%d) do not match.",
+                          length, dims.size()));
     for (size_t i = 0; i < length; ++i) {
       if (names[i] == framework::kEmptyVarName) {
         continue;
@@ -339,8 +407,10 @@ proto::OpDesc *OpDesc::Proto() {
 const std::vector<std::string> &OpDesc::Input(const std::string &name) const {
   auto it = inputs_.find(name);
-  PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name,
-                 Type());
+  PADDLE_ENFORCE_NE(
+      it, inputs_.end(),
+      platform::errors::NotFound("Input %s cannot be found in operator %s.",
+                                 name, Type()));
   return it->second;
 }
@@ -360,8 +430,10 @@ void OpDesc::SetInput(const std::string &param_name,
 const std::vector<std::string> &OpDesc::Output(const std::string &name) const {
   auto it = outputs_.find(name);
-  PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s",
-                 name, Type());
+  PADDLE_ENFORCE_NE(
+      it, outputs_.end(),
+      platform::errors::NotFound("Output %s cannot be found in operator %s.",
+                                 name, Type()));
   return it->second;
 }
@@ -402,7 +474,8 @@ bool OpDesc::HasProtoAttr(const std::string &name) const {
 proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound(
+                                          "Attribute %s is not found.", name));
   return static_cast<proto::AttrType>(it->second.which() - 1);
 }
@@ -467,7 +540,8 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
       return;
     }
     default:
-      PADDLE_THROW("Wrong attr type %d", attr.type());
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported attribute type (code %d).", attr.type()));
   }
   need_update_ = true;
   return;
@@ -504,7 +578,8 @@ void OpDesc::SetAttrMap(
 Attribute OpDesc::GetAttr(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound(
+                                          "Attribute %s is not found.", name));
   return it->second;
 }
@@ -518,7 +593,8 @@ const proto::OpProto::Attr &OpDesc::GetProtoAttr(
     }
   }
-  PADDLE_THROW("Attribute %s is not found in proto %s", name, proto.type());
+  PADDLE_THROW(platform::errors::NotFound(
+      "Attribute %s is not found in proto %s.", name, proto.type()));
 }

 Attribute OpDesc::GetNullableAttr(const std::string &name) const {
@@ -532,7 +608,10 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const {
 std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(
+      it, attrs_.end(),
+      platform::errors::NotFound(
+          "Attribute `%s` is not found in operator `%s`.", name, desc_.type()));
   auto blocks = BOOST_GET_CONST(std::vector<BlockDesc *>, it->second);

   std::vector<int> ids;
@@ -545,7 +624,10 @@ std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
 int OpDesc::GetBlockAttrId(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(
+      it, attrs_.end(),
+      platform::errors::NotFound(
+          "Attribute `%s` is not found in operator `%s`.", name, desc_.type()));
   return BOOST_GET_CONST(BlockDesc *, it->second)->ID();
 }
@@ -632,7 +714,11 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
     VectorToRepeated(v, attr_->mutable_longs());
   }

-  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
+  void operator()(boost::blank) const {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Unsupported calling method of SetAttrDescVisitor object for "
+        "`boost::blank` type."));
+  }
 };

 void OpDesc::Flush() {
@@ -666,8 +752,9 @@ void OpDesc::Flush() {
 }

 void OpDesc::CheckAttrs() {
-  PADDLE_ENFORCE(!Type().empty(),
-                 "CheckAttr() can not be called before type is set.");
+  PADDLE_ENFORCE_EQ(Type().empty(), false,
+                    platform::errors::PreconditionNotMet(
+                        "CheckAttrs() can not be called before type is set."));
   auto *checker = OpInfoMap::Instance().Get(Type()).Checker();
   if (checker == nullptr) {
     // checker is not configured. That operator could be generated by Paddle,
@@ -682,8 +769,10 @@ void OpDesc::InferShape(const BlockDesc &block) const {
   try {
     VLOG(3) << "CompileTime infer shape on " << Type();
     auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
-    PADDLE_ENFORCE(static_cast<bool>(infer_shape),
-                   "%s's infer_shape has not been registered", this->Type());
+    PADDLE_ENFORCE_EQ(
+        static_cast<bool>(infer_shape), true,
+        platform::errors::NotFound(
+            "Operator %s's infer_shape is not registered.", this->Type()));
     CompileTimeInferShapeContext ctx(*this, block);
     if (VLOG_IS_ON(10)) {
       std::ostringstream sout;
@@ -733,10 +822,10 @@ bool CompileTimeInferShapeContext::HasInput(const std::string &name) const {
   if (length == 0) {
     return false;
   }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Input(%s) should have only one value, "
-                    "but it have %d now",
-                    name, length);
+  PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument(
+                                     "Input(%s) should have only one value, "
                                     "but it has %d values now.",
                                     name, length));
   return block_.HasVarRecursive(input_names[0]);
 }
@@ -749,10 +838,10 @@ bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const {
   if (length == 0) {
     return false;
   }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Output(%s) should have only one value, "
-                    "but it have %d now",
-                    name, length);
+  PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument(
+                                     "Output(%s) should have only one value, "
                                     "but it has %d values now.",
                                     name, length));
   return block_.HasVarRecursive(output_names[0]);
 }
@@ -801,7 +890,8 @@ std::vector<std::string> CompileTimeInferShapeContext::Outputs(
 std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
     const std::string &name) const {
   auto var = block_.FindVarRecursive(name);
-  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound("Variable %s is not found.", name));
   std::vector<DDim> res;
   try {
     auto shapes = var->GetShapes();
@@ -823,7 +913,8 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name,
 void CompileTimeInferShapeContext::SetRepeatedDims(
     const std::string &name, const std::vector<DDim> &dims) {
   auto var = block_.FindVarRecursive(name);
-  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound("Variable %s is not found.", name));
   std::vector<std::vector<int64_t>> dim_vec(dims.size());
   std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize<>);
   var->SetShapes(dim_vec);
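The same mechanical pattern is applied throughout this file; shown once in isolation (old form first, new form second):

// Before: untyped check with a plain printf-style message.
PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);

// After: a typed comparison macro plus a structured error category. On
// failure the macro can report both operands, and platform::errors::NotFound
// attaches an error code that callers can dispatch on.
PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound(
                                        "Attribute %s is not found.", name));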
......
@@ -268,6 +268,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
   REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)

+#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
+  REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
+
 #define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class,  \
                               customized_name,                     \
                               customized_type_value,               \
@@ -298,6 +301,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
       ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
       __VA_ARGS__)

+#define REGISTER_OP_XPU_KERNEL_FUNCTOR(op_type, ...)                  \
+  REGISTER_OP_KERNEL_EX(                                              \
+      op_type, XPU, ::paddle::platform::XPUPlace, DEFAULT_TYPE,       \
+      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
+      __VA_ARGS__)
+
 /**
  * Macro to mark what Operator and Kernel
  * we will use and tell the compiler to
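A hypothetical registration site for the new macro (the op and kernel names below are made up): in an op's .cc file built with XPU support, it mirrors REGISTER_OP_CPU_KERNEL.

// Registers float and double XPU kernels for a fictional my_op.
REGISTER_OP_XPU_KERNEL(my_op,
                       paddle::operators::MyOpKernel<float>,
                       paddle::operators::MyOpKernel<double>);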
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_version_registry.h"
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include <boost/any.hpp>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace compatible {
struct OpUpdateRecord {
enum class Type {
kInvalid = 0,
kModifyAttr,
kNewAttr,
kNewInput,
kNewOutput,
kBugfixWithBehaviorChanged,
};
Type type_;
std::string remark_;
};
struct ModifyAttr : OpUpdateRecord {
ModifyAttr(const std::string& name, const std::string& remark,
const boost::any& default_value)
: OpUpdateRecord({Type::kModifyAttr, remark}),
name_(name),
default_value_(default_value) {
// TODO(Shixiaowei02): Check the data type with proto::OpDesc.
}
private:
std::string name_;
boost::any default_value_;
};
struct NewAttr : OpUpdateRecord {
NewAttr(const std::string& name, const std::string& remark,
const boost::any& default_value)
: OpUpdateRecord({Type::kNewAttr, remark}),
name_(name),
default_value_(default_value) {}
private:
std::string name_;
boost::any default_value_;
};
struct NewInput : OpUpdateRecord {
NewInput(const std::string& name, const std::string& remark)
: OpUpdateRecord({Type::kNewInput, remark}), name_(name) {}
private:
std::string name_;
};
struct NewOutput : OpUpdateRecord {
NewOutput(const std::string& name, const std::string& remark)
: OpUpdateRecord({Type::kNewOutput, remark}), name_(name) {}
private:
std::string name_;
};
struct BugfixWithBehaviorChanged : OpUpdateRecord {
explicit BugfixWithBehaviorChanged(const std::string& remark)
: OpUpdateRecord({Type::kBugfixWithBehaviorChanged, remark}) {}
};
class OpVersionDesc {
public:
OpVersionDesc& ModifyAttr(const std::string& name, const std::string& remark,
boost::any default_value) {
infos_.push_back(std::shared_ptr<OpUpdateRecord>(
new compatible::ModifyAttr(name, remark, default_value)));
return *this;
}
OpVersionDesc& NewAttr(const std::string& name, const std::string& remark,
boost::any default_value) {
infos_.push_back(std::shared_ptr<OpUpdateRecord>(
new compatible::NewAttr(name, remark, default_value)));
return *this;
}
OpVersionDesc& NewInput(const std::string& name, const std::string& remark) {
infos_.push_back(std::shared_ptr<OpUpdateRecord>(
new compatible::NewInput(name, remark)));
return *this;
}
OpVersionDesc& NewOutput(const std::string& name, const std::string& remark) {
infos_.push_back(std::shared_ptr<OpUpdateRecord>(
new compatible::NewOutput(name, remark)));
return *this;
}
OpVersionDesc& BugfixWithBehaviorChanged(const std::string& remark) {
infos_.push_back(std::shared_ptr<OpUpdateRecord>(
new compatible::BugfixWithBehaviorChanged(remark)));
return *this;
}
private:
std::vector<std::shared_ptr<OpUpdateRecord>> infos_;
};
class OpVersion {
public:
OpVersion& AddCheckpoint(const std::string& note,
const OpVersionDesc& op_version_desc) {
checkpoints_.push_back(Checkpoint({note, op_version_desc}));
return *this;
}
private:
struct Checkpoint {
std::string note_;
OpVersionDesc op_version_desc_;
};
std::vector<Checkpoint> checkpoints_;
};
class OpVersionRegistrar {
public:
static OpVersionRegistrar& GetInstance() {
static OpVersionRegistrar instance;
return instance;
}
OpVersion& Register(const std::string& op_type) {
if (op_version_map_.find(op_type) != op_version_map_.end()) {
PADDLE_THROW("'%s' is registered in operator version more than once.",
op_type);
}
op_version_map_.insert({op_type, OpVersion()});
return op_version_map_[op_type];
}
private:
std::unordered_map<std::string, OpVersion> op_version_map_;
OpVersionRegistrar() = default;
OpVersionRegistrar& operator=(const OpVersionRegistrar&) = delete;
};
} // namespace compatible
} // namespace framework
} // namespace paddle
#define REGISTER_OP_VERSION(op_type) \
static paddle::framework::compatible::OpVersion \
RegisterOpVersion__##op_type = \
paddle::framework::compatible::OpVersionRegistrar::GetInstance() \
.Register(#op_type)
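A hypothetical registration site for the macro (the op name and attribute are made up); the test below exercises the real macro end to end.

REGISTER_OP_VERSION(my_op)
    .AddCheckpoint(
        "Add a new attribute [beta].",
        paddle::framework::compatible::OpVersionDesc().NewAttr(
            "beta", "An illustrative attribute.", 1.0f));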
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
namespace compatible {
TEST(test_operator_version, test_operator_version) {
REGISTER_OP_VERSION(test__)
.AddCheckpoint(
R"ROC(Fix the bug of reshape op, support the case of axis < 0)ROC",
framework::compatible::OpVersionDesc().BugfixWithBehaviorChanged(
"Support the case of axis < 0"))
.AddCheckpoint(
R"ROC(
        Upgrade reshape, modify one attribute [axis] and add a new attribute [size].
)ROC",
framework::compatible::OpVersionDesc()
.ModifyAttr("axis",
"Increased from the original one method to two.", -1)
.NewAttr("size",
"In order to represent a two-dimensional rectangle, the "
"parameter size is added.",
0))
.AddCheckpoint(
R"ROC(
Add a new attribute [height]
)ROC",
framework::compatible::OpVersionDesc().NewAttr(
"height",
"In order to represent a two-dimensional rectangle, the "
"parameter height is added.",
0))
.AddCheckpoint(
R"ROC(
        Add an input [X2] and an output [Y2]
)ROC",
framework::compatible::OpVersionDesc()
.NewInput("X2", "The second input.")
.NewOutput("Y2", "The second output."));
}
} // namespace compatible
} // namespace framework
} // namespace paddle
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
+#include "paddle/fluid/framework/operator.h"
#include <gflags/gflags.h>
#include <glog/logging.h>
@@ -20,18 +22,21 @@ limitations under the License. */
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_call_stack.h"
#include "paddle/fluid/framework/op_proto_maker.h"
-#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/unused_var_check.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h"
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu_info.h"
+#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
@@ -163,6 +168,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
    auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
    platform::SetDeviceId(dev_id);
+#endif
+  } else if (platform::is_xpu_place(place)) {
+#ifndef PADDLE_WITH_XPU
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Cannot run operator on place %s", place));
+#else
+    auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
+    platform::SetXPUDeviceId(dev_id);
#endif
  }
@@ -604,6 +617,29 @@ class RuntimeInferShapeContext : public InferShapeContext {
    return op_.Outputs(name);
  }
+  std::string GetInputNameByIdx(size_t idx) const override {
+    auto& op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
+    PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(),
+                      platform::errors::OutOfRange(
+                          "The index should be less than the size of inputs of "
+                          "operator %s, but got index is %d and size is %d",
+                          op_.Type(), idx, op_proto->inputs().size()));
+    return op_proto->inputs()[idx].name();
+  }
+  std::string GetOutputNameByIdx(size_t idx) const override {
+    auto& op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
+    PADDLE_ENFORCE_LT(
+        idx, op_proto->outputs().size(),
+        platform::errors::OutOfRange(
+            "The index should be less than the size of outputs of "
+            "operator %s, but got index is %d and size is %d",
+            op_.Type(), idx, op_proto->outputs().size()));
+    return op_proto->outputs()[idx].name();
+  }
  void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
                size_t j = 0) override {
    auto in_it = ctx_.inputs.find(in);
@@ -1084,6 +1120,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
    expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
    kernel_iter = kernels.find(expected_kernel_key);
  }
+#endif
+#ifdef PADDLE_WITH_XPU
+  if (kernel_iter == kernels.end() &&
+      is_xpu_place(expected_kernel_key.place_)) {
+    VLOG(3) << "missing XPU kernel: " << type_
+            << ", expected_kernel_key:" << expected_kernel_key
+            << ", falling back to the CPU one!";
+    expected_kernel_key.place_ = platform::CPUPlace();
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
#endif
  if (kernel_iter == kernels.end()) {
    PADDLE_THROW("op %s does not have kernel for %s", type_,
...
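Both XPU hunks above apply the same policy: look the kernel up under the expected key first, and when the op has no XPU kernel, downgrade the place to CPU and retry. A standalone sketch of that fallback (plain strings stand in for Paddle's place and kernel-key types):
#include <map>
#include <stdexcept>
#include <string>
#include <utility>

using KernelKey = std::pair<std::string, std::string>;  // (op_type, place)

std::string ChooseKernelSketch(const std::map<KernelKey, std::string>& kernels,
                               const std::string& op_type, std::string place) {
  auto it = kernels.find({op_type, place});
  if (it == kernels.end() && place == "XPU") {
    place = "CPU";  // fall back to the CPU kernel, as the hunks above do
    it = kernels.find({op_type, place});
  }
  if (it == kernels.end()) {
    throw std::runtime_error("op " + op_type + " has no kernel for " + place);
  }
  return it->second;  // name of the kernel to run
}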
@@ -64,9 +64,6 @@ constexpr char kZeroVarSuffix[] = "@ZERO";
/// Variables with this suffix are the new Gradient.
constexpr char kNewGradSuffix[] = "@NEWGRAD@";
-/// Variables with this suffix are the loaded from pre-train model.
-constexpr char kLoadedVarSuffix[] = "@LOADED";
/// RuntimeContext is used to relate input/output names of Operator with
/// the corresponding variables in name scope.
/// If an Op has attribute kEnableCacheRuntimeContext, it means that in a same
...
@@ -449,6 +449,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
                                   const BuildStrategy &build_strategy,
                                   ir::Graph *graph)
    : member_(new ParallelExecutorPrivate(places, scope)) {
+  PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]),
+                 platform::errors::Unavailable(
+                     "XPU is not supported in ParallelExecutor"));
  ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
                                 member_->places_.size());
  member_->use_cuda_ = exec_strategy.use_cuda_;
...
@@ -210,6 +210,23 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
      should_run.push_back(true);
    } else {
      should_run.push_back(false);
+      // If an op writes to a feed var, the op should not be pruned.
+      // For example, in the transformer structure, the third output of the
+      // beam_search op is generally assigned to a feed var. Pruning the
+      // assign op would cause an error.
+      if (parent_block_id != -1) {
+        bool flag = false;
+        for (auto& var : op_desc.outputs()) {
+          for (auto& argu : var.arguments()) {
+            if (feed_var_names.count(argu)) {
+              flag = true;
+            }
+          }
+        }
+        if (flag) {
+          should_run.back() = true;
+        }
+      }
    }
  }
...
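The rescue condition added above reduces to one predicate: does any output argument of the op appear in `feed_var_names`? A reduced sketch with plain containers standing in for the proto types:
#include <set>
#include <string>
#include <vector>

struct OpDescLite {
  // each output slot holds the argument (variable) names it writes
  std::vector<std::vector<std::string>> outputs;
};

bool WritesFeedVar(const OpDescLite& op,
                   const std::set<std::string>& feed_vars) {
  for (const auto& slot : op.outputs) {
    for (const auto& arg : slot) {
      if (feed_vars.count(arg)) return true;  // op must be kept, not pruned
    }
  }
  return false;
}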
@@ -185,3 +185,34 @@ TEST(Prune, recurrrent_op) {
  EXPECT_EQ(pruned.blocks(0).ops_size(), 2);
  EXPECT_EQ(pruned.blocks(1).ops_size(), 1);
}
+
+// If an op writes to a feed var, the op should not be pruned.
+TEST(Prune, recurrrent_op_2) {
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::BlockDesc *sub_block = program.AppendBlock(*block);
+  AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
+        f::AttributeMap{}, block);
+
+  std::vector<std::string> state_var_name(1, "y");
+  AddOp("recurrent", {{"input", {"b", "c"}}}, {{"output", {"b1, c1"}}},
+        {{"ex_states", state_var_name},
+         {"states", state_var_name},
+         {"sub_block", sub_block}},
+        block);
+
+  EXPECT_TRUE(sub_block != nullptr);
+  AddOp("rnn_memory_helper", {{"input", {"x"}}}, {{"output", {"a"}}},
+        f::AttributeMap{}, sub_block);
+
+  f::proto::ProgramDesc *pdesc = program.Proto();
+  pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true);
+
+  f::proto::ProgramDesc pruned;
+  std::set<std::string> feed_var_names = {"x", "a"};
+
+  f::Prune(*pdesc, feed_var_names, &pruned);
+  EXPECT_EQ(pruned.blocks_size(), 2);
+  EXPECT_EQ(pruned.blocks(0).ops_size(), 2);
+  EXPECT_EQ(pruned.blocks(1).ops_size(), 1);
+}
@@ -16,6 +16,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/framework.pb.h"
@@ -52,7 +53,8 @@ class InferShapeContext {
                       const std::vector<DDim> &dims) = 0;
  virtual void SetReaderDims(const std::string &name,
                             const std::vector<DDim> &dims);
+  virtual std::string GetInputNameByIdx(size_t idx) const = 0;
+  virtual std::string GetOutputNameByIdx(size_t idx) const = 0;
  virtual AttrReader Attrs() const = 0;
  virtual std::vector<std::string> Inputs(const std::string &name) const = 0;
  virtual std::vector<std::string> Outputs(const std::string &name) const = 0;
...
@@ -76,6 +76,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
                      const platform::DeviceContext& dev_ctx,
                      const size_t& seek, const std::vector<int64_t>& shape);
+// store the bool result in the out tensor
+void TensorContainsNANV2(const framework::Tensor& tensor,
+                         framework::Tensor* out);
+void TensorContainsInfV2(const framework::Tensor& tensor,
+                         framework::Tensor* out);
+void TensorIsfiniteV2(const framework::Tensor& tensor, framework::Tensor* out);
// convert dlpack's DLTensor to tensor
void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst);
...
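Unlike the older bool-returning checks, these V2 variants write their verdict into a result tensor, which keeps the check on the tensor's own device. A hedged usage sketch, assuming `t` is any initialized tensor:
framework::Tensor flag;         // holds a single bool after the call
TensorContainsNANV2(t, &flag);  // the Inf/IsFinite variants are called the same way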
@@ -2,10 +2,10 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags)
cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform)
cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function)
add_subdirectory(jit)
-cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer)
+cc_library(amp SRCS amp_auto_cast.cc DEPS layer)
+cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp)
cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator)
cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator)
cc_library(imperative_profiler SRCS profiler.cc)
...
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/amp_auto_cast.h"
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/imperative/variable_wrapper.h"
namespace paddle {
namespace imperative {
AmpOperators::AmpOperators()
: allow_ops_(new std::unordered_set<std::string>()),
block_ops_(new std::unordered_set<std::string>()) {}
AmpOperators::~AmpOperators() {}
AmpOperators& AmpOperators::Instance() {
static AmpOperators instance;
return instance;
}
std::shared_ptr<std::unordered_set<std::string>> AmpOperators::GetAllowOps() {
return allow_ops_;
}
std::shared_ptr<std::unordered_set<std::string>> AmpOperators::GetBlockOps() {
return block_ops_;
}
inline std::string GetDtypeStr(
const std::shared_ptr<imperative::VarBase>& var) {
return framework::DataTypeToString(var->DataType());
}
inline bool NeedCast(const std::shared_ptr<VarBase>& var) {
if (!platform::is_gpu_place(var->Place())) {
return false;
}
if (var->DataType() == framework::proto::VarType::FP32 ||
var->DataType() == framework::proto::VarType::FP16) {
return true;
} else {
return false;
}
}
// NOTE: Trace a cast op, so if a var is casted from fp32 to fp16, then the grad
// var will be cast back from fp16 to fp32 during backward phase.
static inline std::shared_ptr<imperative::VarBase> CastToType(
const std::shared_ptr<VarBase>& var,
const framework::proto::VarType::Type dst_type) {
const auto& tracer = imperative::GetCurrentTracer();
imperative::NameVarBaseMap ins = {{"X", {var}}};
framework::AttributeMap attrs = {{"in_dtype", var->DataType()},
{"out_dtype", dst_type}};
auto out = std::shared_ptr<imperative::VarBase>(
new imperative::VarBase(tracer->GenerateUniqueName()));
imperative::NameVarBaseMap outs = {{"Out", {out}}};
{
AutoCastGuard guard(tracer, false);
tracer->TraceOp("cast", ins, outs, std::move(attrs));
}
return out;
}
static inline std::shared_ptr<imperative::VarBase> CastToFP16(
const std::shared_ptr<VarBase>& var) {
auto dst_type = framework::proto::VarType::FP16;
if (NeedCast(var) && (var->DataType() != dst_type)) {
return CastToType(var, dst_type);
}
return var;
}
static inline std::shared_ptr<imperative::VarBase> CastToFP32(
const std::shared_ptr<VarBase>& var) {
auto dst_type = framework::proto::VarType::FP32;
if (NeedCast(var) && (var->DataType() != dst_type)) {
return CastToType(var, dst_type);
}
return var;
}
static inline framework::proto::VarType::Type GetPromoteType(
const NameVarBaseMap& ins) {
auto dst_type = framework::proto::VarType::FP16;
for (const auto& pair : ins) {
for (const auto& var : pair.second) {
if (var->DataType() == framework::proto::VarType::FP32) {
dst_type = var->DataType();
break;
}
}
}
return dst_type;
}
NameVarBaseMap AutoCastInputs(const std::string& op_type,
const NameVarBaseMap& ins) {
NameVarBaseMap new_ins = {};
if (AmpOperators::Instance().GetAllowOps()->count(op_type)) {
for (const auto& pair : ins) {
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to float16";
for (const auto& var : pair.second) {
auto new_var = CastToFP16(var);
new_ins[pair.first].emplace_back(new_var);
}
}
return new_ins;
} else if (AmpOperators::Instance().GetBlockOps()->count(op_type)) {
for (const auto& pair : ins) {
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to float";
for (const auto& var : pair.second) {
auto new_var = CastToFP32(var);
new_ins[pair.first].emplace_back(new_var);
}
}
return new_ins;
} else {
auto dst_type = GetPromoteType(ins);
for (const auto& pair : ins) {
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to "
<< framework::DataTypeToString(dst_type);
for (const auto& var : pair.second) {
// NOTE(zhiqiu): Conv + BN always occur together, we needn't
// cast X of batch_norm to FP32, which is produced by conv as FP16 type.
if (op_type == "batch_norm" && pair.first == "X" &&
dst_type == framework::proto::VarType::FP32) {
new_ins[pair.first].emplace_back(var);
continue;
}
auto new_var = dst_type == framework::proto::VarType::FP32
? CastToFP32(var)
: CastToFP16(var);
new_ins[pair.first].emplace_back(new_var);
}
}
return new_ins;
}
return ins;
}
} // namespace imperative
} // namespace paddle
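`AutoCastInputs` above implements a three-way policy: ops on the allow list get FP16 inputs, ops on the block list get FP32 inputs, and any other op is promoted to FP32 as soon as one of its inputs is FP32. A condensed standalone sketch of the decision (an enum stands in for the VarType proto):
#include <string>
#include <unordered_set>
#include <vector>

enum class DType { FP16, FP32 };

DType ChooseRunDType(const std::string& op_type,
                     const std::vector<DType>& input_dtypes,
                     const std::unordered_set<std::string>& allow_ops,
                     const std::unordered_set<std::string>& block_ops) {
  if (allow_ops.count(op_type)) return DType::FP16;  // numerically safe ops
  if (block_ops.count(op_type)) return DType::FP32;  // numerically dangerous ops
  for (DType d : input_dtypes) {                     // promotion rule, as in
    if (d == DType::FP32) return DType::FP32;        // GetPromoteType above
  }
  return DType::FP16;
}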
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <unordered_set>
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace paddle {
namespace imperative {
// Singleton implementation with C++ 11
class AmpOperators {
public:
~AmpOperators();
AmpOperators(const AmpOperators& o) = delete;
const AmpOperators& operator=(const AmpOperators& o) = delete;
static AmpOperators& Instance();
std::shared_ptr<std::unordered_set<std::string>> GetAllowOps();
std::shared_ptr<std::unordered_set<std::string>> GetBlockOps();
private:
AmpOperators(); // forbid calling default constructor
// The set of ops that support fp16 calculation and are considered numerically
// safe and performance critical. These ops are always converted to fp16.
std::shared_ptr<std::unordered_set<std::string>> allow_ops_;
// The set of ops that support fp16 calculation and are considered numerically
// dangerous and whose effects may also be observed in downstream ops.
std::shared_ptr<std::unordered_set<std::string>> block_ops_;
};
// NOTE(zhiqiu): AutoCastGuard is used for RAII.
class AutoCastGuard {
public:
AutoCastGuard(std::shared_ptr<Tracer> tracer, bool guard_mode)
: tracer_(tracer) {
pre_mode_ = tracer_->IsAutoCastEnabled();
if (pre_mode_ != guard_mode) {
tracer_->SetEnableAutoCast(guard_mode);
}
}
~AutoCastGuard() { tracer_->SetEnableAutoCast(pre_mode_); }
// forbid copy and operator=
AutoCastGuard(const AutoCastGuard& guard) = delete;
AutoCastGuard& operator=(const AutoCastGuard& guard) = delete;
private:
std::shared_ptr<Tracer> tracer_;
bool pre_mode_;
};
NameVarBaseMap AutoCastInputs(const std::string& op_type,
const NameVarBaseMap& ins);
} // namespace imperative
} // namespace paddle
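`AutoCastGuard` is the usual save-and-restore RAII idiom: record the tracer's autocast mode on entry and restore it on scope exit, so `CastToType` can trace its internal `cast` op without autocasting recursively. The same idiom in miniature:
class FlagGuard {
 public:
  FlagGuard(bool* flag, bool value) : flag_(flag), saved_(*flag) {
    *flag_ = value;
  }
  ~FlagGuard() { *flag_ = saved_; }  // restored even on early return or exception

  FlagGuard(const FlagGuard&) = delete;
  FlagGuard& operator=(const FlagGuard&) = delete;

 private:
  bool* flag_;
  bool saved_;
};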
@@ -30,12 +30,13 @@
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/profiler.h"
+DECLARE_bool(sort_sum_gradient);
namespace paddle {
namespace imperative {
-void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy,
-                       bool retain_graph) {
-  backward_strategy_ = strategy;
+void BasicEngine::Init(VarBase* var, bool retain_graph) {
+  sorted_sum_gradient_ = FLAGS_sort_sum_gradient;
  retain_graph_ = retain_graph;
  init_node_ = var->GradVarBase()->GradNode();
  var->GradVarBase()->ClearGradNode();
@@ -105,7 +106,7 @@ void BasicEngine::PrepareGradAccumulators(const OpBase& op) {
      auto& accumulator = accumulators_[var.get()];
      if (!accumulator) {
-        if (backward_strategy_.sorted_sum_gradient_) {
+        if (sorted_sum_gradient_) {
          accumulator.reset(new SortedGradientAccumulator(var.get()));
        } else {
          accumulator.reset(new EagerGradientAccumulator(var.get()));
...
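This change retires the per-call `BackwardStrategy` in favor of a process-wide gflags flag: defined once in some .cc file, re-declared wherever it is read, and snapshotted into `sorted_sum_gradient_` at `Init` time. The gflags side of that pattern, for reference:
// In exactly one translation unit:
#include <gflags/gflags.h>
DEFINE_bool(sort_sum_gradient, false,
            "Sum gradients in the reverse order of the forward ops");

// In any other file that reads it:
DECLARE_bool(sort_sum_gradient);
bool SnapshotSortSumGradient() { return FLAGS_sort_sum_gradient; }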
@@ -18,7 +18,6 @@
#include <unordered_map>
#include <utility>
#include <vector>
-#include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/imperative/gradient_accumulator.h"
@@ -30,8 +29,7 @@ class OpBase;
class BasicEngine : public Engine {
 public:
-  void Init(VarBase* var, const detail::BackwardStrategy& strategy,
-            bool retain_graph = false);
+  void Init(VarBase* var, bool retain_graph = false);
  void Execute() override;
@@ -46,7 +44,7 @@ class BasicEngine : public Engine {
 private:
  std::shared_ptr<GradOpNode> init_node_;
-  detail::BackwardStrategy backward_strategy_;
+  bool sorted_sum_gradient_;
  std::unordered_map<GradOpNode*, size_t> node_deps_;
  std::unordered_map<VariableWrapper*, std::unique_ptr<GradientAccumulator>>
      accumulators_;
...
@@ -76,6 +76,13 @@ class TensorAddFunctor : public boost::static_visitor<> {
    blas.AXPY(numel_, 1., x_, y_);
  }
+  void operator()(const platform::XPUPlace& place) {
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
#ifdef PADDLE_WITH_CUDA
  void operator()(const platform::CUDAPlace& place) {
    platform::CUDADeviceContext* ctx =
...
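The functor gains an `XPUPlace` overload purely to reject that place at runtime; `boost::static_visitor` dispatches on the concrete place type held by the variant. The equivalent shape with `std::variant` (simplified places; Paddle itself uses Boost here):
#include <stdexcept>
#include <variant>

struct CPUPlace {};
struct XPUPlace {};
using Place = std::variant<CPUPlace, XPUPlace>;

struct TensorAdd {
  void operator()(const CPUPlace&) const { /* blas.AXPY(...) path */ }
  void operator()(const XPUPlace&) const {
    throw std::runtime_error(
        "Gradient accumulation on XPUPlace is not supported");
  }
};

void Accumulate(const Place& place) { std::visit(TensorAdd{}, place); }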
@@ -16,7 +16,9 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/imperative/type_defs.h"
@@ -32,8 +34,12 @@ class DygraphInferShapeContext : public framework::InferShapeContext {
 public:
  DygraphInferShapeContext(const NameVarMap<VarType>* in,
                           const NameVarMap<VarType>* out,
-                           const framework::AttributeMap* attr)
-      : var_base_map_in_(in), var_base_map_out_(out), attrs_(attr) {}
+                           const framework::AttributeMap* attr,
+                           const std::string op_type)
+      : var_base_map_in_(in),
+        var_base_map_out_(out),
+        attrs_(attr),
+        op_type_(op_type) {}
  bool HasInput(const std::string& name) const override {
    // has only one input
@@ -135,6 +141,28 @@ class DygraphInferShapeContext : public framework::InferShapeContext {
    return vec_res;
  }
+  std::string GetInputNameByIdx(size_t idx) const override {
+    auto& op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_type_).proto_;
+    PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(),
+                      platform::errors::OutOfRange(
+                          "The index should be less than the size of inputs of "
+                          "operator %s, but got index is %d and size is %d",
+                          op_type_, idx, op_proto->inputs().size()));
+    return op_proto->inputs()[idx].name();
+  }
+  std::string GetOutputNameByIdx(size_t idx) const override {
+    auto& op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_type_).proto_;
+    PADDLE_ENFORCE_LT(
+        idx, op_proto->outputs().size(),
+        platform::errors::OutOfRange(
+            "The index should be less than the size of outputs of "
+            "operator %s, but got index is %d and size is %d",
+            op_type_, idx, op_proto->outputs().size()));
+    return op_proto->outputs()[idx].name();
+  }
  void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
                size_t j = 0) override {
@@ -367,6 +395,7 @@ class DygraphInferShapeContext : public framework::InferShapeContext {
  const NameVarMap<VarType>* var_base_map_in_;
  const NameVarMap<VarType>* var_base_map_out_;
  const framework::AttributeMap* attrs_;
+  const std::string op_type_;
};
}  // namespace imperative
...
@@ -186,6 +186,8 @@ class VarBase {
  framework::proto::VarType::Type DataType() const { return var_->DataType(); }
+  const platform::Place Place() const { return var_->Place(); }
  void ClearGradient();
  std::shared_ptr<VarBase> NewVarBase(const platform::Place& dst_place,
...
@@ -33,6 +33,8 @@
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/string_helper.h"
+DECLARE_bool(sort_sum_gradient);
namespace paddle {
namespace imperative {
@@ -529,8 +531,7 @@ class PartialGradTask {
                  const std::vector<std::shared_ptr<VarBase>> &output_targets,
                  const std::vector<std::shared_ptr<VarBase>> &output_grads,
                  const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-                  const platform::Place &place,
-                  const detail::BackwardStrategy &strategy, bool create_graph,
+                  const platform::Place &place, bool create_graph,
                  bool retain_graph, bool allow_unused, bool only_inputs);
  std::vector<std::shared_ptr<VarBase>> Run();
@@ -577,7 +578,7 @@ class PartialGradTask {
  bool retain_graph_;
  bool allow_unused_;
  bool only_inputs_;
-  detail::BackwardStrategy strategy_;
+  bool sorted_sum_gradient_{FLAGS_sort_sum_gradient};
};
PartialGradTask::PartialGradTask(
@@ -585,15 +586,14 @@ PartialGradTask::PartialGradTask(
    const std::vector<std::shared_ptr<VarBase>> &output_targets,
    const std::vector<std::shared_ptr<VarBase>> &output_grads,
    const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-    const platform::Place &place, const detail::BackwardStrategy &strategy,
-    bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs) {
+    const platform::Place &place, bool create_graph, bool retain_graph,
+    bool allow_unused, bool only_inputs) {
  input_targets_ = input_targets;
  place_ = place;
  create_graph_ = create_graph;
  retain_graph_ = retain_graph;
  allow_unused_ = allow_unused;
  only_inputs_ = only_inputs;
-  strategy_ = strategy;
  PADDLE_ENFORCE_EQ(only_inputs_, true,
                    platform::errors::Unimplemented(
@@ -887,7 +887,10 @@ void PartialGradTask::RunEachOp(OpBase *op) {
                                          op->Attrs(), op->place());
    PADDLE_ENFORCE_NOT_NULL(
        double_grad_node,
-        platform::errors::NotFound("The Op %s doesn't have any grad op.",
+        platform::errors::NotFound("The Op %s doesn't have any grad op. If you "
+                                   "don't intend to calculate higher order "
+                                   "derivatives, please set `create_graph` to "
+                                   "False.",
                                   op->Type()));
    VLOG(10) << "Create " << double_grad_node->size()
             << " double grad op(s) for " << op->Type()
@@ -978,7 +981,7 @@ void PartialGradTask::PrepareInitialGradientAccumulators(const OpBase *op) {
    if (!accumulator) {
      accumulator.reset(new GradientAccumulationInfo(
-          var, strategy_.sorted_sum_gradient_, create_graph_));
+          var, sorted_sum_gradient_, create_graph_));
    }
    accumulator->IncreaseTotalRefCnt();
@@ -1030,11 +1033,11 @@ PartialGradEngine::PartialGradEngine(
    const std::vector<std::shared_ptr<VarBase>> &output_targets,
    const std::vector<std::shared_ptr<VarBase>> &output_grads,
    const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-    const platform::Place &place, const detail::BackwardStrategy &strategy,
-    bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs)
+    const platform::Place &place, bool create_graph, bool retain_graph,
+    bool allow_unused, bool only_inputs)
    : task_(new PartialGradTask(input_targets, output_targets, output_grads,
-                                no_grad_vars, place, strategy, create_graph,
-                                retain_graph, allow_unused, only_inputs)) {}
+                                no_grad_vars, place, create_graph, retain_graph,
+                                allow_unused, only_inputs)) {}
PartialGradEngine::~PartialGradEngine() { Clear(); }
...
@@ -16,7 +16,6 @@
#include <memory>
#include <vector>
-#include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/platform/place.h"
@@ -33,8 +32,7 @@ class PartialGradEngine : public Engine {
                    const std::vector<std::shared_ptr<VarBase>> &output_targets,
                    const std::vector<std::shared_ptr<VarBase>> &output_grads,
                    const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-                    const platform::Place &place,
-                    const detail::BackwardStrategy &strategy, bool create_graph,
+                    const platform::Place &place, bool create_graph,
                    bool retain_graph, bool allow_unused, bool only_inputs);
  ~PartialGradEngine();
...
@@ -13,7 +13,9 @@
// limitations under the License.
#include "paddle/fluid/imperative/prepared_operator.h"
#include <sstream>
#include "paddle/fluid/imperative/execution_context.h"
#include "paddle/fluid/imperative/infer_shape_context.h"
#include "paddle/fluid/imperative/infer_var_type_context.h"
@@ -40,23 +42,17 @@ static void PrepareData(const platform::Place& place,
    for (const auto& var_base : name_pair.second) {
      const auto* tensor = GetTensorFromVar(var_base->Var());
      if (tensor && tensor->IsInitialized()) {
-        auto tmp_place = tensor->place();
-
-        // TODO(jiabin): Support transform data layout when we Verify it on more
-        // tests
-        if (!(tmp_place == place)) {
-          auto kernel_type_for_var = op.GetKernelTypeForVar(
-              name_pair.first, *tensor, expected_kernel_key);
-          if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
-            continue;
-          } else {
-            VLOG(3) << "Transform Variable " << var_base->Name() << " from "
-                    << kernel_type_for_var << " to " << expected_kernel_key;
-            framework::Tensor out;
-            TransformData(expected_kernel_key, kernel_type_for_var, *tensor,
-                          &out);
-            SetTensorToVariable(var_base->Var(), out, var_base->MutableVar());
-          }
+        auto kernel_type_for_var = op.GetKernelTypeForVar(
+            name_pair.first, *tensor, expected_kernel_key);
+        if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
+          continue;
+        } else {
+          VLOG(3) << "Transform Variable " << var_base->Name() << " from "
+                  << kernel_type_for_var << " to " << expected_kernel_key;
+          framework::Tensor out;
+          TransformData(expected_kernel_key, kernel_type_for_var, *tensor,
+                        &out);
+          SetTensorToVariable(var_base->Var(), out, var_base->MutableVar());
        }
      }
    }
@@ -91,12 +87,26 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
  auto& kernels = kernels_iter->second;
  framework::RuntimeContext ctx({}, {});
+#ifdef PADDLE_WITH_MKLDNN
+  // MKLDNN variant of code reads attributes in some of GetKernelTypeForVar and
+  // GetKernelType functions, so we need to copy the attributes there.
+  // Const qualifier of Attrs had to be discarded to overwrite it.
+  auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
+  mutable_op_attrs = attrs;
+#endif
  auto expected_kernel_key =
      op.GetExpectedKernelType(DygraphExecutionContext<VarType>(
          op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs));
  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
  auto kernel_iter = kernels.find(expected_kernel_key);
+#ifdef PADDLE_WITH_XPU
+  if (kernel_iter == kernels.end() &&
+      is_xpu_place(expected_kernel_key.place_)) {
+    expected_kernel_key.place_ = platform::CPUPlace();
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
+#endif
  // TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
  PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
                    platform::errors::NotFound(
@@ -137,7 +147,8 @@ static void PreparedOpRunImpl(
  // TODO(zjl): remove scope in dygraph
  framework::Scope scope;
-  DygraphInferShapeContext<VarType> infer_shape_ctx(&ins, &outs, &attrs);
+  DygraphInferShapeContext<VarType> infer_shape_ctx(&ins, &outs, &attrs,
+                                                    op.Type());
  static_cast<const framework::OperatorWithKernel&>(op).InferShape(
      &infer_shape_ctx);
...
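The rewritten `PrepareData` no longer short-circuits on a raw place comparison; it always asks the op for the variable's kernel type and lets `NeedTransform` decide. Conceptually the predicate compares the whole (place, layout, dtype) triple, roughly like this (illustrative only; the real `OpKernelType` comparison is more involved):
#include <string>

struct KernelTypeLite {
  std::string place, layout, dtype;
};

bool NeedTransformLite(const KernelTypeLite& var,
                       const KernelTypeLite& expected) {
  return var.place != expected.place || var.layout != expected.layout ||
         var.dtype != expected.dtype;
}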
@@ -17,9 +17,11 @@
//
#include <paddle/fluid/framework/op_registry.h>
#include <memory>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/imperative/execution_context.h"
#include "paddle/fluid/imperative/infer_shape_context.h"
@@ -384,7 +386,7 @@ TEST(test_layer, test_dygraph_infershape_context) {
  concat_att_map["axis"] = 1;
  DygraphInferShapeContext<imperative::VarBase> infer_shape_ctx(
-      &ins, &outs, &concat_att_map);
+      &ins, &outs, &concat_att_map, "dummy");
  bool have_x = infer_shape_ctx.HasOutputs("Out");
  ASSERT_EQ(have_x, true);
...
@@ -176,7 +176,7 @@ TEST(test_prepare_op, test_prepare_data) {
}
#endif
-TEST(test_prepare_op, test_prepare_data_same_place) {
+void TestPrepareDataSamePlace(framework::AttributeMap attr_map) {
  std::shared_ptr<imperative::VarBase> vin(
      new imperative::VarBase(false, "vin"));
  std::shared_ptr<imperative::VarBase> vout(
@@ -198,7 +198,6 @@ TEST(test_prepare_op, test_prepare_data_same_place) {
  var_pair out_pair = var_pair("Out", vb_vector(1, vout));
  imperative::NameVarBaseMap ins = {x_pair};
  imperative::NameVarBaseMap outs = {out_pair};
-  framework::AttributeMap attr_map;
  const std::string op_type = "relu";
  const auto& info = framework::OpInfoMap::Instance().Get(op_type);
  if (info.Checker()) info.Checker()->Check(&attr_map);
@@ -222,8 +221,21 @@ TEST(test_prepare_op, test_prepare_data_same_place) {
      }
    }
  }
}
+
+TEST(test_prepare_op, test_prepare_data_same_place) {
+  TestPrepareDataSamePlace({});
+}
+
+#ifdef PADDLE_WITH_MKLDNN
+TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) {
+  TestPrepareDataSamePlace({{"use_mkldnn", true}});
+}
+#endif
}  // namespace imperative
}  // namespace paddle
USE_OP(split);
USE_OP(relu);
+#ifdef PADDLE_WITH_MKLDNN
+USE_OP_DEVICE_KERNEL(relu, MKLDNN);
+#endif
@@ -240,9 +240,8 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) {
  framework::AttributeMap reduce_attr_map;
  tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map,
                 gpu_place, true);
-  detail::BackwardStrategy back_st;
  imperative::BasicEngine engine;
-  engine.Init(reduce_sum_out.get(), back_st);
+  engine.Init(reduce_sum_out.get());
  engine.Execute();
  framework::LoDTensor rlt;
@@ -356,9 +355,8 @@ TEST(test_tracer, test_var_without_grad_var) {
  ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL);
  ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL);
-  detail::BackwardStrategy back_st;
  imperative::BasicEngine engine;
-  engine.Init(vout.get(), back_st);
+  engine.Init(vout.get());
  engine.Execute();
  // check the grad
...
@@ -16,6 +16,7 @@
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/imperative/amp_auto_cast.h"
#include "paddle/fluid/imperative/op_base.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/string_helper.h"
@@ -53,8 +54,14 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
    attr_checker->Check(&attrs, true);
  }
+  NameVarBaseMap new_ins = ins;
+  if (enable_autocast_) {
+    VLOG(5) << "Auto mixed precision run operator: " << type;
+    new_ins = AutoCastInputs(type, ins);
+  }
  try {
-    OpBase::Run(*op, ins, outs, attrs, place);
+    OpBase::Run(*op, new_ins, outs, attrs, place);
  } catch (platform::EnforceNotMet& exception) {
    framework::AppendErrorOpHint(type, &exception);
    throw std::move(exception);
@@ -73,11 +80,11 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
  if (enable_program_desc_tracing_) {
    VLOG(5) << "Trace op " << type << " into ProgramDesc";
-    program_desc_tracer_->InsertOp(type, ins, outs, attrs);
+    program_desc_tracer_->InsertOp(type, new_ins, outs, attrs);
  }
-  if (ComputeRequiredGrad(ins, outs, trace_backward)) {
-    CreateGradOpNode(*op, ins, outs, attrs, place);
+  if (ComputeRequiredGrad(new_ins, outs, trace_backward)) {
+    CreateGradOpNode(*op, new_ins, outs, attrs, place);
  } else {
    VLOG(3) << "No Grad to track for Op: " << type;
  }
...
@@ -97,6 +97,10 @@ class Tracer {
  void SetHasGrad(bool has_grad) { has_grad_ = has_grad; }
+  void SetEnableAutoCast(bool enabled) { enable_autocast_ = enabled; }
+
+  bool IsAutoCastEnabled() const { return enable_autocast_; }
 private:
  std::unique_ptr<BasicEngine> basic_engine_;
  std::unique_ptr<jit::ProgramDescTracer> program_desc_tracer_;
@@ -104,6 +108,7 @@ class Tracer {
  std::unique_ptr<UniqueNameGenerator> generator_;
  platform::Place expected_place_;
  bool has_grad_{true};
+  bool enable_autocast_{false};
};
// To access static variable current_tracer
...
@@ -111,6 +111,28 @@ class VariableWrapper {
    }
  }
+  const platform::Place Place() const {
+    const framework::Tensor* tensor = nullptr;
+    auto place =
+        platform::CPUPlace();  // Default place for var not initialized.
+    if (var_.IsInitialized()) {
+      if (type_ == framework::proto::VarType::LOD_TENSOR) {
+        tensor = &(var_.Get<framework::LoDTensor>());
+      } else if (type_ == framework::proto::VarType::SELECTED_ROWS) {
+        tensor = &(var_.Get<framework::SelectedRows>().value());
+      } else {
+        VLOG(6) << "Variable " << name_ << " is not initialized";
+        return place;
+      }
+    }
+    if (tensor && tensor->IsInitialized()) {
+      return tensor->place();
+    } else {
+      VLOG(6) << "The tensor of variable " << name_ << " is not initialized";
+      return place;
+    }
+  }
 private:
  void SetGradVar(const std::shared_ptr<VariableWrapper>& var) {
    auto shared_var = grad_var_.lock();
...
@@ -64,10 +64,9 @@ if (NOT APPLE AND NOT WIN32)
     SRCS analyzer_tester.cc
     EXTRA_DEPS reset_tensor_array paddle_fluid_shared
     ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
-elseif(NOT WIN32)
-  # TODO: Fix this unittest failed on Windows
+elseif(WIN32)
  inference_analysis_test(test_analyzer
     SRCS analyzer_tester.cc
     EXTRA_DEPS reset_tensor_array paddle_inference_api
     ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
endif()
@@ -6,13 +6,13 @@ and make the various optimization features be pluggable and co-exist in a pipeline,
We borrowed some concepts from LLVM, such as
-- [Pass](./pass.h)es to implement optimizations that traverse the inference program,
-- [DataFlowGraph](./data_flow_graph.h) to represent the data flow graph built from a program,
-- [PassManager](./pass_manager.h) to manage a sequence of `Pass`es over a graph.
+- [Pass](../../framework/ir/pass.h)es to implement optimizations that traverse the inference program,
+- [Graph](../../framework/ir/graph.h) to represent the data flow graph built from a program,
+- [PassManager](./ir_pass_manager.h) to manage a sequence of `Pass`es over a graph.
There are some other basic concepts here
-- [Node](./node.h), the node in a `DataFlowGraph`,
+- [Node](../../framework/ir/node.h), the node in a `Graph`,
- `Function`, the Operator in Fluid,
- `Value`, the Variable in Fluid;
- [Argument](./argument.h), the argument that is treated as the input and output of all `Pass`es in the pipeline,
@@ -21,9 +21,9 @@ There are some other basic concepts here
The `inference/analysis` module makes all the passes into a pipeline, and works in the following way:
-1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc,
-2. Call the middle passes one by one; the same `DataFlowGraph` is passed across all the passes,
-3. Transform a new ProgramDesc from the modified `DataFlowGraph`.
+1. Build a `Graph` from a Fluid inference ProgramDesc,
+2. Call the middle passes one by one; the same `Graph` is passed across all the passes,
+3. Transform a new ProgramDesc from the modified `Graph`.
The new optimization features can be added as an independent `Pass` and controlled by gflags,
each pass will generate unified debug information or visualization for better debugging.
@@ -54,5 +54,5 @@ It can be used as a helper class that draws the modified graph after each pass.
There are some helper legacy functions/classes for analysis.
- [dot.h](./dot.h) gives an easy-to-use interface for generating `DOT` codes,
-- [graph_traits.h](./graph_traits.h) contains the interfaces of the graph traversal algorithms; it uses `iterator` to make the algorithms easy to share across different passes; there are some implementations in [data_flow_graph.cc](./data_flow_graph.cc), such as BFS and DFS.
+- [graph_traits.h](../../framework/ir/graph_traits.h) contains the interfaces of the graph traversal algorithms; it uses `iterator` to make the algorithms easy to share across different passes; there are some implementations in [graph_helper.cc](../../framework/ir/graph_helper.cc), such as BFS and DFS.
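To make the pipeline concrete, here is a minimal sketch of the Pass/PassManager shape described above (illustrative only; the real interfaces live under `framework/ir`):
#include <memory>
#include <utility>
#include <vector>

struct Graph { /* nodes and edges built from a ProgramDesc */ };

class Pass {
 public:
  virtual ~Pass() = default;
  virtual void Apply(Graph* graph) = 0;  // mutate the graph in place
};

class PassManager {
 public:
  void Add(std::unique_ptr<Pass> pass) { passes_.push_back(std::move(pass)); }
  void Run(Graph* graph) {
    for (auto& p : passes_) p->Apply(graph);  // same Graph across all passes
  }

 private:
  std::vector<std::unique_ptr<Pass>> passes_;
};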
@@ -54,8 +54,7 @@ if(WITH_TESTING)
        ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
    set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
    set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
-elseif(NOT WIN32)
-  # TODO: Fix this unittest failed on Windows
+elseif(WIN32)
    inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
        ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
    set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
@@ -67,8 +66,7 @@ endif()
if (NOT APPLE AND NOT WIN32)
    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_fluid_shared
            ARGS --dirname=${WORD2VEC_MODEL_DIR})
-elseif (NOT WIN32)
-  # TODO: Fix this unittest failed on Windows
+elseif (WIN32)
    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
            ARGS --dirname=${WORD2VEC_MODEL_DIR})
endif()
@@ -218,6 +218,17 @@ void AnalysisConfig::EnableMkldnnQuantizer() {
  Update();
}
+void AnalysisConfig::EnableMkldnnBfloat16() {
+#ifdef PADDLE_WITH_MKLDNN
+  use_mkldnn_bfloat16_ = true;
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnBfloat16";
+  use_mkldnn_bfloat16_ = false;
+#endif
+
+  Update();
+}
MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const {
  PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_,
                          "MkldnnQuantizer was not enabled yet.");
@@ -331,6 +342,12 @@ void AnalysisConfig::Update() {
#endif
  }
+  if (use_mkldnn_bfloat16_) {
+#ifdef PADDLE_WITH_MKLDNN
+    pass_builder()->EnableMkldnnBfloat16();
+#endif
+  }
#ifdef PADDLE_WITH_MKLDNN
  // Do not optimize when mkldnn is on
  if (enable_memory_optim_ && !use_mkldnn_) {
@@ -399,6 +416,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
  ss << ";";
  ss << use_mkldnn_quantizer_;
+  ss << use_mkldnn_bfloat16_;
  ss << model_from_memory_;
  ss << with_profile_;
...
@@ -485,4 +485,25 @@ TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) {
}
#endif
+#ifdef PADDLE_WITH_CUDA
+TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
+  AnalysisConfig config;
+  config.SetModel(FLAGS_dirname);
+  config.SwitchIrOptim(true);
+  config.EnableUseGpu(100, 0);
+  config.EnableMkldnnBfloat16();
+#ifdef PADDLE_WITH_MKLDNN
+  ASSERT_EQ(config.mkldnn_bfloat16_enabled(), true);
+#else
+  ASSERT_EQ(config.mkldnn_bfloat16_enabled(), false);
+#endif
+}
+#endif
+
+TEST(AnalysisPredictor, bf16_pass_strategy) {
+  std::vector<std::string> passes;
+  PassStrategy passStrategy(passes);
+  passStrategy.EnableMkldnnBfloat16();
+}
}  // namespace paddle
...@@ -401,6 +401,19 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -401,6 +401,19 @@ struct PD_INFER_DECL AnalysisConfig {
/// ///
void EnableMkldnnQuantizer(); void EnableMkldnnQuantizer();
///
/// \brief Turn on MKLDNN bfloat16.
///
///
void EnableMkldnnBfloat16();
///
/// \brief A boolean state telling whether to use the MKLDNN Bfloat16.
///
/// \return bool Whether to use the MKLDNN Bfloat16.
///
bool mkldnn_bfloat16_enabled() const { return use_mkldnn_bfloat16_; }
///
/// \brief A boolean state telling whether the thread local CUDA stream is
/// enabled.
@@ -592,6 +605,7 @@ struct PD_INFER_DECL AnalysisConfig {
int mkldnn_cache_capacity_{0};
bool use_mkldnn_quantizer_{false};
std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_;
bool use_mkldnn_bfloat16_{false};
// If the config is already used on a predictor, it becomes invalid.
// Any config can only be used with one predictor.
...
@@ -143,6 +143,10 @@ void GpuPassStrategy::EnableMkldnnQuantizer() {
LOG(ERROR) << "GPU does not support MKL-DNN quantization";
}
void GpuPassStrategy::EnableMkldnnBfloat16() {
LOG(ERROR) << "GPU not support MKL-DNN bfloat16";
}
CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// NOTE the large fusions should be located in the front, so that they will
// not be damaged by smaller ones.
@@ -223,4 +227,12 @@ void CpuPassStrategy::EnableMkldnnQuantizer() {
#endif
}
void CpuPassStrategy::EnableMkldnnBfloat16() {
#ifdef PADDLE_WITH_MKLDNN
use_mkldnn_bfloat16_ = true;
#else
use_mkldnn_bfloat16_ = false;
#endif
}
} // namespace paddle
@@ -132,6 +132,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \brief Enable MKLDNN quantize optimization.
virtual void EnableMkldnnQuantizer() {}
/// \brief Enable MKLDNN bfloat16.
virtual void EnableMkldnnBfloat16() {}
/// \brief Check if we are using gpu.
/// \return A bool variable implying whether we are in gpu mode.
bool use_gpu() const { return use_gpu_; }
@@ -161,6 +164,7 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy {
use_gpu_ = other.use_gpu_;
use_mkldnn_ = other.use_mkldnn_;
use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_;
use_mkldnn_bfloat16_ = other.use_mkldnn_bfloat16_;
}
/// \brief Default destructor.
virtual ~CpuPassStrategy() = default;
@@ -174,9 +178,13 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy {
/// \brief Enable MKLDNN quantize optimization.
void EnableMkldnnQuantizer() override;
/// \brief Enable MKLDNN bfloat16.
void EnableMkldnnBfloat16() override;
protected:
/// \cond Protected
bool use_mkldnn_quantizer_{false};
bool use_mkldnn_bfloat16_{false};
/// \endcond
};
@@ -205,6 +213,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
/// \brief Not supported in GPU mode yet.
void EnableMkldnnQuantizer() override;
/// \brief Not supported in GPU mode yet.
void EnableMkldnnBfloat16() override;
/// \brief Default destructor.
virtual ~GpuPassStrategy() = default;
...
@@ -235,6 +235,12 @@ PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnQuantizer(
PADDLE_CAPI_EXPORT extern bool PD_MkldnnQuantizerEnabled(
const PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnBfloat16(
PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern bool PD_MkldnnBfloat16Enabled(
const PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern void PD_SetModelBuffer(PD_AnalysisConfig* config,
const char* prog_buffer,
size_t prog_buffer_size,
...
@@ -207,6 +207,18 @@ bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) {
return config->config.mkldnn_quantizer_enabled();
}
void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
"PD_AnalysisConfig should not be null"));
config->config.EnableMkldnnBfloat16();
}
bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
"PD_AnalysisConfig should not be null"));
return config->config.mkldnn_bfloat16_enabled();
}
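Together the pair gives C clients the same probe-after-enable pattern as the C++ API. A short usage sketch (header name assumed; config creation and error handling elided):

#include "paddle_c_api.h"  // assumed capi header

void try_enable_bf16(PD_AnalysisConfig* config) {
  PD_EnableMkldnnBfloat16(config);
  // On a non-MKLDNN build the enable call logs an error and leaves the flag
  // false, so read it back before relying on bfloat16 kernels.
  if (!PD_MkldnnBfloat16Enabled(config)) {
    /* fall back to the default FP32 path */
  }
}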
void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer,
size_t prog_buffer_size, const char* params_buffer,
size_t params_buffer_size) {
...
@@ -51,7 +51,13 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
if (enable_int8) {
#if IS_TRT_VERSION_GE(5000)
if (op_desc.Type() != "conv2d_transpose") {
PADDLE_ENFORCE_EQ(
op_desc.HasAttr("Input_scale"), true,
platform::errors::InvalidArgument("Input scale not found. TRT int8"
" requires conv/deconv to have "
"input quantization scales."));
}
float in_scale =
BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127;
auto weight_scale =
...
@@ -83,7 +83,12 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
} else if (shape.size() == 3UL) {
return nvinfer1::Dims3(shape[0], shape[1], shape[2]);
}
nvinfer1::Dims dims;
dims.nbDims = shape.size();
for (size_t i = 0; i < shape.size(); i++) {
dims.d[i] = shape[i];
}
return dims;
}
}
} // NOLINT
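With the Dims4 padding removed, shapes of rank above four now survive the conversion intact (up to nvinfer1::Dims::MAX_DIMS). A standalone sketch of the new fallback behaviour (function name is illustrative; assumes the TensorRT headers):

#include <NvInfer.h>
#include <vector>

// Copy an arbitrary-rank shape into nvinfer1::Dims instead of padding to 4-D.
nvinfer1::Dims ToDims(const std::vector<int>& shape) {
  nvinfer1::Dims dims;
  dims.nbDims = static_cast<int>(shape.size());
  for (size_t i = 0; i < shape.size(); ++i) dims.d[i] = shape[i];
  return dims;
}
// e.g. {8, 16, 4, 4, 32} now yields nbDims == 5, where the old fallback
// would have produced Dims4(8, 16, 1, 1).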
...
@@ -24,6 +24,8 @@ struct SimpleOpTypeSetTeller : public Teller {
#if IS_TRT_VERSION_GE(5130)
teller_set.insert("relu6");
teller_set.insert("hard_sigmoid");
int8_teller_set.insert("relu6");
int8_teller_set.insert("hard_sigmoid");
#endif
#if IS_TRT_VERSION_GE(6000)
teller_set.insert("fused_embedding_eltwise_layernorm");
@@ -53,11 +55,11 @@ struct SimpleOpTypeSetTeller : public Teller {
"elementwise_add",
"leaky_relu",
"fc",
"concat",
"scale",
"elementwise_mul",
"conv2d_transpose",
"hard_swish"};
std::unordered_set<std::string> teller_set{
"matmul",
"conv2d",
...
@@ -76,6 +76,16 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
return ret;
}
template <typename T>
void EmbEltwiseLayernormPluginDynamic<T>::terminate() {
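// Release the device-side weight copies that initialize() uploaded.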
for (auto ptr : embs_gpu_) {
if (ptr) cudaFree(ptr);
}
if (bias_gpu_) cudaFree(bias_gpu_);
if (scale_gpu_) cudaFree(scale_gpu_);
}
template <typename T>
bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
@@ -153,7 +163,7 @@ int EmbEltwiseLayernormPluginDynamic<T>::enqueue(
int64_t *emb_ptr_gpu_d =
emb_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
std::vector<uintptr_t> in_ptr, emb_ptr;
for (int i = 0; i < input_num; i++) {
in_ptr.push_back(reinterpret_cast<uintptr_t>(inputs[i]));
emb_ptr.push_back(reinterpret_cast<uintptr_t>(embs_gpu_[i]));
...
@@ -81,9 +81,13 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
}
nvinfer1::IPluginV2DynamicExt* clone() const override {
auto ptr = new EmbEltwiseLayernormPluginDynamic(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_,
eps_);
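// Share the already-initialized device buffers with the clone so it is
// usable without re-running initialize().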
ptr->embs_gpu_ = embs_gpu_;
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = scale_gpu_;
return ptr;
}
const char* getPluginType() const override {
@@ -111,6 +115,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
return sum_num;
}
void terminate() override;
void serialize(void* buffer) const override {
// SerializeValue(&buffer, with_fp16_);
SerializeValue(&buffer, emb_sizes_);
...
@@ -80,6 +80,12 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
#if IS_TRT_VERSION_GE(6000)
void PReluPluginDynamic::terminate() {
if (p_gpu_weight_) {
cudaFree(p_gpu_weight_);
}
}
int PReluPluginDynamic::initialize() {
cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
...
@@ -102,12 +102,15 @@ class PReluPluginDynamic : public DynamicPluginTensorRT {
}
~PReluPluginDynamic() { cudaFree(p_gpu_weight_); }
nvinfer1::IPluginV2DynamicExt* clone() const override {
auto ptr = new PReluPluginDynamic(weight_.data(), weight_.size(), mode_);
ptr->p_gpu_weight_ = p_gpu_weight_;
return ptr;
}
const char* getPluginType() const override { return "prelu_plugin"; }
int getNbOutputs() const override { return 1; }
int initialize() override;
void terminate() override;
size_t getSerializationSize() const override;
void serialize(void* buffer) const override;
...
@@ -51,8 +51,11 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
}
nvinfer1::IPluginV2DynamicExt* clone() const override {
auto ptr = new SkipLayerNormPluginDynamic(
bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = scale_gpu_;
return ptr;
}
const char* getPluginType() const override { return "skip_layernorm_plugin"; }
...
@@ -20,6 +20,12 @@ function(download_int8_data install_dir data_file)
endif()
endfunction()
function(download_GRU_data install_dir data_file)
if (NOT EXISTS ${install_dir}/${data_file})
inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/gru ${data_file})
endif()
endfunction()
function(download_quant_data install_dir data_file)
if (NOT EXISTS ${install_dir}/${data_file})
inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file})
@@ -97,6 +103,18 @@ function(inference_analysis_api_quant_test_run TARGET_NAME test_binary fp32_mode
--iterations=2)
endfunction()
function(inference_analysis_api_lexical_test_run TARGET_NAME test_binary infer_model data_path)
inference_analysis_test_run(${TARGET_NAME}
COMMAND ${test_binary}
ARGS --infer_model=${infer_model}
--infer_data=${data_path}
--batch_size=50
--cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
--with_accuracy_layer=true
--use_analysis=true
--iterations=2)
endfunction()
function(preprocess_data2bin_test_run target py_script_source data_dir output_file)
py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/${py_script_source}
ARGS --data_dir=${data_dir}
@@ -114,6 +132,7 @@ if(NOT APPLE AND WITH_MKLML)
set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool")
download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc)
set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150)
else()
# TODO: fix this test on MACOS and OPENBLAS, the reason is that
# fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
@@ -174,6 +193,8 @@ inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true)
set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY")
# text_classification
set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
@@ -315,6 +336,20 @@ if(WITH_MKLDNN)
download_int8_data(${INT8_MOBILENET_SSD_MODEL_DIR} "mobilenet_ssd_int8_model.tar.gz" )
inference_analysis_api_object_dection_int8_test_run(test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH})
### Lexical analysis GRU model
set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru")
download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz")
download_GRU_data("${GRU_PATH}" "GRU_eval_model.tar.gz")
set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin")
set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model")
set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis")
set(LEXICAL_TEST_APP_SRC "analyzer_lexical_analysis_gru_tester.cc")
# build test binary to be used in subsequent tests
inference_analysis_api_test_build(${LEXICAL_TEST_APP} ${LEXICAL_TEST_APP_SRC})
# run lexical analysis test
inference_analysis_api_lexical_test_run(test_analyzer_lexical_gru ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH})
### optimized FP32 vs. Quant INT8 tests
set(QUANT_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/quant")
@@ -439,19 +474,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
endif()
inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_deserialize_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
endif()
set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite")
...
@@ -54,6 +54,9 @@ TEST(PD_AnalysisConfig, use_gpu) {
PD_SwitchIrOptim(config, true);
bool ir_optim = PD_IrOptim(config);
CHECK(ir_optim) << "NO";
PD_EnableMkldnnBfloat16(config);
bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
CHECK(!bfloat16_enable) << "NO";
PD_EnableTensorRtEngine(config, 1 << 20, 1, 3, Precision::kFloat32, false,
false);
bool trt_enable = PD_TensorrtEngineEnabled(config);
...
@@ -88,6 +88,9 @@ TEST(PD_AnalysisConfig, profile_mkldnn) {
PD_EnableMkldnnQuantizer(config);
bool quantizer_enable = PD_MkldnnQuantizerEnabled(config);
CHECK(quantizer_enable) << "NO";
PD_EnableMkldnnBfloat16(config);
bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
CHECK(bfloat16_enable) << "NO";
PD_SetMkldnnCacheCapacity(config, 0);
PD_SetModel(config, prog_file.c_str(), params_file.c_str());
PD_DeleteAnalysisConfig(config);
...
(Diffs for the remaining changed files are collapsed and not shown.)