Commit 4496ab41, authored by S sandyhouse

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_timeline

@@ -28,7 +28,10 @@ include(generic) # simplify cmake module
# TODO(Shibo Tao): remove find_package(CUDA) completely.
find_package(CUDA QUIET)
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
+option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN" OFF)
+if (WITH_GPU AND WITH_XPU)
+  message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
+endif()
# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
  message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
...
@@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
ARG WITH_GPU
ARG WITH_AVX
-ENV WOBOQ OFF
ENV WITH_GPU=${WITH_GPU:-ON}
ENV WITH_AVX=${WITH_AVX:-ON}
@@ -149,21 +148,11 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
# version until jupyter fixes this issue.
-# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
-# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
-# version(1.7.1 for now), which causes building documentation failed.
RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
    pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
    pip3.7 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
    pip --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark
RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
    pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
@@ -184,9 +173,9 @@ RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
RUN pip3 --no-cache-dir install coverage
RUN pip3.6 --no-cache-dir install coverage
RUN pip3.7 --no-cache-dir install coverage
RUN pip --no-cache-dir install coverage
COPY ./python/requirements.txt /root/
@@ -204,12 +193,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
RUN pip --no-cache-dir install certifi urllib3[secure]
-# Install woboq_codebrowser to /woboq
-RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
-    (cd /woboq \
-      cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-            -DCMAKE_BUILD_TYPE=Release . \
-      make)
# ar mishandles 4GB files
# https://sourceware.org/bugzilla/show_bug.cgi?id=14625
...
@@ -33,7 +33,7 @@ pip install paddlepaddle
# Linux GPU cuda10cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu==1.8.3.post97
+pip install paddlepaddle-gpu==1.8.4.post97
```
It is recommended to read [this doc](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/install/index_en.html) on our website.
...
@@ -30,7 +30,7 @@ pip install paddlepaddle
# Linux GPU cuda10cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu==1.8.3.post97
+pip install paddlepaddle-gpu==1.8.4.post97
```
For more installation details, see the official [installation guide](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.8/beginners_guide/install/index_cn.html).
...
@@ -63,6 +63,11 @@ if(WITH_BOX_PS)
    add_definitions(-DPADDLE_WITH_BOX_PS)
endif()
+if(WITH_XPU)
+    message(STATUS "Compile with XPU!")
+    add_definitions(-DPADDLE_WITH_XPU)
+endif()
if(WITH_GPU)
    add_definitions(-DPADDLE_WITH_CUDA)
    add_definitions(-DEIGEN_USE_GPU)
...
@@ -61,6 +61,10 @@ function(detect_installed_gpus out_variable)
  if(NOT CUDA_gpu_detect_output)
    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
    set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
+    # TODO: fix automatic GPU detection failing on Windows
+    if(WIN32)
+      set(${out_variable} "61 75" PARENT_SCOPE)
+    endif()
  else()
    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
  endif()
@@ -202,6 +206,11 @@ if (NOT WIN32) # windows msvc2015 support c++11 natively.
  set(CMAKE_CUDA_STANDARD 11)
endif(NOT WIN32)
+# (Note) On Windows, if /W[1-4] is simply deleted, /W1 is re-added by default and conflicts with -w,
+# so replace /W[1-4] with /W0 instead.
+if (WIN32)
+  string(REGEX REPLACE "/W[1-4]" " /W0 " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
+endif(WIN32)
# in cuda9, suppress cuda warning on eigen
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -w")
# Set :expt-relaxed-constexpr to suppress Eigen warnings
...
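For reference, a minimal standalone sketch (an editorial aside, not part of the commit; the starting flag value and file name are hypothetical) of the /W downgrade performed above. Deleting /W[1-4] outright would let the toolchain re-add /W1, which then conflicts with the -w appended later, so the flag is rewritten to /W0 instead:

# Sketch: run with `cmake -P warn_flags.cmake` (hypothetical file name).
set(CMAKE_CUDA_FLAGS "/W3 -Xcompiler /MT")  # hypothetical starting value
string(REGEX REPLACE "/W[1-4]" " /W0 " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
message(STATUS "CUDA flags after scrub: ${CMAKE_CUDA_FLAGS}")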
@@ -17,7 +17,7 @@ include(ExternalProject)
set(CUB_PREFIX_DIR ${THIRD_PARTY_PATH}/cub)
set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub/src/extern_cub)
set(CUB_REPOSITORY https://github.com/NVlabs/cub.git)
-set(CUB_TAG 1.9.8)
+set(CUB_TAG 1.8.0)
cache_third_party(extern_cub
    REPOSITORY ${CUB_REPOSITORY}
...
@@ -14,13 +14,21 @@
INCLUDE(ExternalProject)
+execute_process(COMMAND bash -c "gcc -dumpversion" OUTPUT_VARIABLE GCC_VERSION)
SET(GLOO_PROJECT "extern_gloo")
IF((NOT DEFINED GLOO_VER) OR (NOT DEFINED GLOO_URL))
  MESSAGE(STATUS "use pre defined download url")
  SET(GLOO_VER "master" CACHE STRING "" FORCE)
  SET(GLOO_NAME "gloo" CACHE STRING "" FORCE)
-  SET(GLOO_URL "https://pslib.bj.bcebos.com/gloo.tar.gz" CACHE STRING "" FORCE)
+  if(${GCC_VERSION} VERSION_EQUAL "8.2.0")
+    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc8" CACHE STRING "" FORCE)
+  else()
+    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc482" CACHE STRING "" FORCE)
+  endif()
ENDIF()
MESSAGE(STATUS "GLOO_NAME: ${GLOO_NAME}, GLOO_URL: ${GLOO_URL}")
SET(GLOO_SOURCE_DIR "${THIRD_PARTY_PATH}/gloo")
SET(GLOO_DOWNLOAD_DIR "${GLOO_SOURCE_DIR}/src/${GLOO_PROJECT}")
...
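One caveat with the new GCC probe above, noted as an editorial aside: `gcc -dumpversion` prints a trailing newline, so the raw OUTPUT_VARIABLE may not compare cleanly in version checks. A minimal sketch (assumption: bash and gcc on PATH) of a stripped probe:

execute_process(COMMAND bash -c "gcc -dumpversion"
                OUTPUT_VARIABLE GCC_VERSION
                OUTPUT_STRIP_TRAILING_WHITESPACE)  # strip the trailing newline
if(GCC_VERSION VERSION_EQUAL "8.2.0")
  message(STATUS "gcc 8.2.0 toolchain detected")
endif()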
@@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
SET(MKLDNN_REPOSITORY https://github.com/intel/mkl-dnn.git)
-SET(MKLDNN_TAG fb95345126ade4c54f5507e580a5f5da8d30a515)
+SET(MKLDNN_TAG 1ea812f4f5aa1bd989372a23ab50d0f0f81ee677)
# Introduce variables:
# * CMAKE_INSTALL_LIBDIR
...
if (NOT WITH_XPU)
return()
endif()
INCLUDE(ExternalProject)
SET(XPU_PROJECT "extern_xpu")
SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE)
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/api/include")
SET(XPU_RUNTIME_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/runtime/include")
SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
SET(XPU_API_LIB_NAME "libxpuapi.so")
SET(XPU_RT_LIB_NAME "libxpurt.so")
SET(XPU_SIM_LIB_NAME "libxpusim.so")
SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
SET(XPU_SIM_LIB "${XPU_LIB_DIR}/${XPU_SIM_LIB_NAME}")
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
INCLUDE_DIRECTORIES(${XPU_RUNTIME_INC_DIR})
FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(XPU)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY xpu/api xpu/runtime xpu/lib \n"
" DESTINATION ${XPU_INSTALL_DIR})\n")
ExternalProject_Add(
${XPU_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${XPU_SOURCE_DIR}
DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz
&& tar xvf xpu.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
)
ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL)
set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
# generate a static dummy target to track xpulib dependencies
# for cc_library(xxx SRCS xxx.c DEPS xpulib)
generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake")
TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_SIM_LIB})
ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
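For context, a one-line sketch (not from the commit; the target name is hypothetical) of how the dummy xpulib target above is meant to be consumed, per the cc_library convention referenced in the comment:

# A kernel or helper library pulls in the XPU API/runtime libs transitively:
cc_library(my_xpu_helper SRCS my_xpu_helper.cc DEPS xpulib)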
@@ -232,7 +232,9 @@ if(WIN32)
                CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
                CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
                CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-        string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
-        set(flag_var "${flag_var} /w")
+        string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
+    endforeach(flag_var)
+    foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
+        set(${flag_var} "${${flag_var}} /w")
    endforeach(flag_var)
endif()
@@ -384,8 +384,12 @@ function(cc_test_run TARGET_NAME)
      set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
      set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
      set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
-      # No unit test should exceed 10 minutes.
-      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+      # No unit test should exceed 2 minutes.
+      if (APPLE OR WIN32)
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+      else()
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
+      endif()
    endif()
endfunction()
@@ -742,9 +746,14 @@ function(py_test TARGET_NAME)
              ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
    endif()
+    if (APPLE OR WIN32)
+      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+    else()
+      # No unit test should exceed 2 minutes in Linux.
+      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
+    endif()
-    # No unit test should exceed 10 minutes.
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
  endif()
endfunction()
...
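As an aside, a test that legitimately needs more than the new 120-second Linux ceiling can still override the property itself; a one-line sketch with a hypothetical target name:

set_tests_properties(my_slow_test PROPERTIES TIMEOUT 600)  # per-test opt-out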
@@ -110,10 +110,12 @@ function(copy_part_of_thrid_party TARGET DST)
            SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
            DSTS ${dst_dir} ${dst_dir}/lib)
+    if (WITH_CRYPTO)
        set(dst_dir "${DST}/third_party/install/cryptopp")
        copy(${TARGET}
                SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES}
                DSTS ${dst_dir} ${dst_dir}/lib)
+    endif()
    set(dst_dir "${DST}/third_party/install/xxhash")
    copy(${TARGET}
@@ -187,7 +189,7 @@ copy(inference_lib_dist
        SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
        DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/internal)
copy(inference_lib_dist
-        SRCS ${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io/crypto/cipher.h
+        SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io/crypto/cipher.h
        DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)
...
# Attention: cmake will append these flags to compile command automatically.
# So if you want to add global option, change this file rather than flags.cmake
-# default: "-g"
-set(CMAKE_C_FLAGS_DEBUG "-g")
-# default: "-O3 -DNDEBUG"
-set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG")
-# default: "-O2 -g -DNDEBUG"
-set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
-# default: "-Os -DNDEBUG"
-set(CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG")
-# default: "-g"
-set(CMAKE_CXX_FLAGS_DEBUG "-g")
-# default: "-O3 -DNDEBUG"
-set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
-# default: "-O2 -g -DNDEBUG"
-set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
-# default: "-Os -DNDEBUG"
-set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
-# default: "-g"
-set(CMAKE_CUDA_FLAGS_DEBUG "-g")
-# default: "-O3 -DNDEBUG"
-set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG")
-# default: "-O2 -g -DNDEBUG"
-set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
-# default: "-O1 -DNDEBUG"
-set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
+# NOT WIN32
+# DEBUG: default: "-g"
+# RELEASE: default: "-O3 -DNDEBUG"
+# RELWITHDEBINFO: default: "-O2 -g -DNDEBUG"
+# MINSIZEREL: default: "-O2 -g -DNDEBUG"
+if(NOT WIN32)
+    set(CMAKE_C_FLAGS_DEBUG "-g")
+    set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG")
+    set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
+    set(CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG")
+    set(CMAKE_CXX_FLAGS_DEBUG "-g")
+    set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
+    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
+    set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
+endif()
+if(WITH_GPU)
+    set(CMAKE_CUDA_FLAGS_DEBUG "-g")
+    set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG")
+    set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
+    set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
+endif()
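A quick way to confirm what the restructured block above actually sets, sketched as an editorial aside (drop into any CMakeLists.txt; not part of the commit):

foreach(cfg DEBUG RELEASE RELWITHDEBINFO MINSIZEREL)
  message(STATUS "C ${cfg}: ${CMAKE_C_FLAGS_${cfg}} | CXX ${cfg}: ${CMAKE_CXX_FLAGS_${cfg}}")
endforeach()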
@@ -8,12 +8,13 @@ function(op_library TARGET)
  set(hip_cu_srcs)
  set(miopen_hip_cc_srcs)
  set(cu_cc_srcs)
+  set(xpu_cc_srcs)
  set(cudnn_cu_cc_srcs)
  set(cudnn_cu_srcs)
  set(CUDNN_FILE)
  set(mkldnn_cc_srcs)
  set(MKLDNN_FILE)
-  set(op_common_deps operator op_registry math_function layer)
+  set(op_common_deps operator op_registry math_function layer common_infer_shape_functions)
  set(options "")
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS)
@@ -60,6 +61,12 @@ function(op_library TARGET)
        list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
      endif()
    endif()
+    if(WITH_XPU)
+      string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")
+      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc)
+        list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc)
+      endif()
+    endif()
  else()
    foreach(src ${op_library_SRCS})
      if (${src} MATCHES ".*\\.hip.cu$")
@@ -76,6 +83,8 @@ function(op_library TARGET)
        list(APPEND mkldnn_cc_srcs ${src})
      elseif(${src} MATCHES ".*\\.cu.cc$")
        list(APPEND cu_cc_srcs ${src})
+      elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$")
+        list(APPEND xpu_cc_srcs ${src})
      elseif(${src} MATCHES ".*\\.cc$")
        list(APPEND cc_srcs ${src})
      else()
@@ -109,7 +118,7 @@ function(op_library TARGET)
    hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
        ${op_common_deps})
  else()
-    cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
+    cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS}
        ${op_common_deps})
  endif()
@@ -118,7 +127,7 @@ function(op_library TARGET)
      "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
      "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
      "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-      "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op")
+      "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op")
  if ("${TARGET}" STREQUAL "${manual_pybind_op}")
    set(pybind_flag 1)
  endif()
@@ -150,10 +159,11 @@ function(op_library TARGET)
  list(LENGTH cu_srcs cu_srcs_len)
  list(LENGTH cu_cc_srcs cu_cc_srcs_len)
  list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
+  list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
  list(LENGTH hip_cu_srcs hip_cu_srcs_len)
  list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
  if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
-      ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
+      ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
    file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
    set(pybind_flag 1)
  endif()
@@ -179,6 +189,9 @@ function(op_library TARGET)
    file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
  endif()
+  if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
+    file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
+  endif()
  # pybind USE_OP_DEVICE_KERNEL for MKLDNN
  if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
    # Append first implemented MKLDNN activation operator
@@ -228,6 +241,7 @@ function(register_operators)
  file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
  string(REPLACE "_mkldnn" "" OPS "${OPS}")
+  string(REPLACE "_xpu" "" OPS "${OPS}")
  string(REPLACE ".cc" "" OPS "${OPS}")
  list(REMOVE_DUPLICATES OPS)
  list(LENGTH register_operators_DEPS register_operators_DEPS_len)
...
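The WITH_XPU branches above rely on a file-naming convention; a sketch (hypothetical op name, not from the commit) of the expansion op_library() performs:

set(TARGET "softmax_op")                              # hypothetical op target
string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")  # -> "softmax_xpu_op"
# op_library() then looks for xpu/${XPU_FILE}.cc and, if it exists, emits the
# pybind registration: USE_OP_DEVICE_KERNEL(softmax, XPU)
message(STATUS "expected XPU kernel source: xpu/${XPU_FILE}.cc")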
@@ -250,6 +250,11 @@ if(WITH_GPU)
    file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
endif(WITH_GPU)
+if(WITH_XPU)
+    include(external/xpu)          # download, build, install xpu
+    list(APPEND third_party_deps extern_xpu)
+endif(WITH_XPU)
if(WITH_PSLIB)
    include(external/pslib)          # download, build, install pslib
    list(APPEND third_party_deps extern_pslib)
@@ -263,10 +268,6 @@
    endif()
endif(WITH_PSLIB)
-if(NOT WIN32 AND NOT APPLE)
-    include(external/gloo)
-    list(APPEND third_party_deps extern_gloo)
-endif()
if(WITH_BOX_PS)
    include(external/box_ps)
@@ -274,6 +275,11 @@
endif(WITH_BOX_PS)
if(WITH_DISTRIBUTE)
+    if(WITH_GLOO)
+        include(external/gloo)
+        list(APPEND third_party_deps extern_gloo)
+    endif()
    if(WITH_GRPC)
        list(APPEND third_party_deps extern_grpc)
    else()
...
@@ -27,6 +27,7 @@ add_subdirectory(fleet)
add_subdirectory(io)
#ddim lib
proto_library(framework_proto SRCS framework.proto)
+proto_library(heter_service_proto SRCS heter_service.proto)
proto_library(data_feed_proto SRCS data_feed.proto)
proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
  data_feed_proto)
@@ -121,6 +122,10 @@ cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
  device_context)
+cc_library(op_version_registry SRCS op_version_registry.cc DEPS framework_proto boost)
+cc_test(op_version_registry_test SRCS op_version_registry_test.cc DEPS op_version_registry)
cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog)
cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(no_need_buffer_vars_inference SRCS no_need_buffer_vars_inference.cc DEPS attribute device_context)
@@ -163,23 +168,23 @@ if(WITH_PYTHON)
  if (NOT WIN32)
    add_custom_command(TARGET framework_py_proto POST_BUILD
      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
-      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
-      COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/__init__.py
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
+      COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py
      COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
-      COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
+      COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
      COMMENT "Copy generated python proto into directory paddle/fluid/proto."
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
  else(NOT WIN32)
    string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/")
-    string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/")
+    string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/")
    add_custom_command(TARGET framework_py_proto POST_BUILD
      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
-      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
-      COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/__init__.py
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
+      COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py
      COMMAND copy /Y *.py ${proto_dstpath}
      COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath}
      COMMENT "Copy generated python proto into directory paddle/fluid/proto."
-      COMMENT "Copy generated python proto into directory paddle/fleet/proto."
+      COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto."
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
  endif(NOT WIN32)
endif()
@@ -195,20 +200,37 @@ cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc o
if(WITH_DISTRIBUTE)
  cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
              dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
+              heterxpu_trainer.cc
-              data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc downpour_worker_opt.cc
+              data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc
              pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-              device_context scope framework_proto trainer_desc_proto glog fs shell fleet_wrapper box_wrapper lodtensor_printer
+              device_context scope framework_proto trainer_desc_proto glog fs shell
+              fleet_wrapper heter_wrapper box_wrapper lodtensor_printer
              lod_rank_table feed_fetch_method sendrecvop_rpc communicator collective_helper ${GLOB_DISTRIBUTE_DEPS}
-              graph_to_program_pass variable_helper data_feed_proto timer monitor)
+              graph_to_program_pass variable_helper data_feed_proto timer monitor
+              heter_service_proto)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+elseif(WITH_PSLIB)
+  cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
+              dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
+              heterxpu_trainer.cc
+              data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc
+              pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
+              device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
+              lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
+              graph_to_program_pass variable_helper timer monitor pslib_brpc )
+  # TODO: Fix these unittest failed on Windows
+  if(NOT WIN32)
+    cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
+  endif()
else()
  cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
              dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
+              heterxpu_trainer.cc
-              data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc downpour_worker_opt.cc
+              data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc
              pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-              device_context scope framework_proto data_feed_proto trainer_desc_proto glog
+              device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
-              lod_rank_table fs shell fleet_wrapper box_wrapper lodtensor_printer feed_fetch_method
+              lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
              graph_to_program_pass variable_helper timer monitor)
  # TODO: Fix these unittest failed on Windows
  if(NOT WIN32)
@@ -250,6 +272,7 @@ cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatib
cc_library(save_load_util SRCS save_load_util DEPS tensor scope layer)
cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
+cc_library(generator SRCS generator.cc)
# Get the current working branch
execute_process(
...
@@ -45,14 +45,35 @@ inline void InitVarsInScope(const std::vector<VarInfo> &var_infos, Scope *scope,
// get CommContext and remote send and recv op
void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
#ifdef PADDLE_WITH_DISTRIBUTE
-  // init communicator here
-  auto *instance = operators::distributed::Communicator::GetInstance();
-  auto initialized = instance ? true : false;
-  PADDLE_ENFORCE_EQ(initialized, true,
-                    platform::errors::InvalidArgument(
-                        "Communicator is not Initialized, you may use "
-                        "FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
-                        "develop/markdown_doc/transpiler)"));
+  bool need_communicator = false;
+  for (auto &node : graphs[0]->Nodes()) {
+    VLOG(3) << "node name " << node->Name();
+    if (node && node->IsOp()) {
+      if (node->Name() == "send") {
+        auto send_varnames =
+            BOOST_GET_CONST(std::vector<std::string>,
+                            node->Op()->GetNullableAttr("send_varnames"));
+        if (send_varnames.size() > 0) {
+          need_communicator = true;
+          break;
+        }
+      }
+    }
+  }
+  if (need_communicator) {
+    // init communicator here
+    auto *instance = operators::distributed::Communicator::GetInstance();
+    auto initialized = instance ? true : false;
+    PADDLE_ENFORCE_EQ(initialized, true,
+                      platform::errors::InvalidArgument(
+                          "Communicator is not Initialized, you may use "
+                          "FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
+                          "develop/markdown_doc/transpiler)"));
+  }
#endif
}
...
@@ -117,7 +117,7 @@ static void TransData(const framework::LoDTensor &src_item,
      TensorCopy(src_item, platform::CPUPlace(), dst_item);
#endif
    } else {
-      dst_item->ShareDataWith(src_item);
+      TensorCopy(src_item, platform::CPUPlace(), dst_item);
    }
  } else {
    dst_item->clear();
...
@@ -27,6 +27,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/heter_service.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
@@ -51,10 +52,23 @@ bool CheckValidOutput(LoDTensor* tensor, size_t batch_size);
class FleetWrapper;
+#ifdef PADDLE_WITH_PSLIB
+class HeterWrapper;
+#endif
class PullDenseWorker {
 public:
  virtual ~PullDenseWorker() {}
  virtual void Initialize(const TrainerDesc& param);
+#ifdef PADDLE_WITH_CUDA
+  void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
+  void AddPlace(const paddle::platform::Place place) {
+    places_.push_back(place);
+  }
+  void AddThreadScope(Scope* scope) { thread_scopes_.push_back(scope); }
+#endif
  int Start();
  void Stop();
  void SetRootScope(Scope* scope) { root_scope_ = scope; }
@@ -62,6 +76,7 @@ class PullDenseWorker {
  void ResetThreadVersion(uint64_t table_id);
  void Wait(std::vector<::std::future<int32_t>>* status_vec);
  void PullDense(bool force_update = false);
+  void CreatePinVar();
  int GetThreadIdByScope(const Scope* scope);
  void SetThreadIdByScope(const Scope* scope, int tid);
  static std::shared_ptr<PullDenseWorker> GetInstance() {
@@ -105,6 +120,12 @@ class PullDenseWorker {
  std::mutex mutex_for_mean_scale_;
  float total_batch_num_ = 0;
  std::unordered_map<const Scope*, int> scope_to_thread_id_;
+#ifdef PADDLE_WITH_CUDA
+  std::vector<cudaStream_t> copy_streams_;
+  std::vector<paddle::platform::Place> places_;
+  std::vector<Scope*> thread_scopes_;
+#endif
};
// should incorporate different type of device
@@ -126,6 +147,8 @@ class DeviceWorker {
  virtual void BindingDataFeedMemory() = 0;
  virtual void SetRootScope(Scope* root_scope);
  virtual void SetDataFeed(DataFeed* data_feed);
+  virtual void SetWorkerNum(int num) {}
+  virtual void CacheProgram(const ProgramDesc& main_program) {}
  virtual void SetNeedDumpField(bool need_dump_field) {
    need_dump_field_ = need_dump_field;
  }
@@ -161,6 +184,7 @@ class DeviceWorker {
  FetchConfig fetch_config_;
  bool use_cvm_;
  bool no_cvm_;
+  TrainerDesc trainer_desc_;
  // dump params or grads for debug
  bool need_dump_param_;
@@ -306,6 +330,87 @@ class DownpourWorkerOpt : public DownpourWorker {
  uint64_t async_tid_ = 0;
};
#ifdef PADDLE_WITH_PSLIB
class HeterCpuWorker : public HogwildWorker {
public:
HeterCpuWorker() {}
virtual ~HeterCpuWorker() {}
virtual void Initialize(const TrainerDesc& desc);
virtual void TrainFiles();
virtual void TrainFilesWithProfiler();
virtual void SetNeedDump(bool need_dump_field);
virtual void SetChannelWriter(ChannelObject<std::string>* queue);
virtual void SetWorkerNum(int num) { worker_num_ = num; }
virtual void Schedule(int taskid);
virtual void JumpContext(std::shared_ptr<HeterTask> task);
virtual void CacheProgram(const ProgramDesc& main_program) {
new (&program_) ProgramDesc(main_program);
}
virtual void GetXpuOpIndex();
protected:
std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
std::shared_ptr<paddle::framework::HeterWrapper> heter_ptr_;
std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
void FillSparseValue(std::shared_ptr<HeterTask> task, size_t table_id);
void PushGradients();
void CollectLabelInfo(std::shared_ptr<HeterTask> task, size_t table_id);
void AdjustInsWeight(std::shared_ptr<HeterTask> task);
void DumpParam();
void CopySparseTable();
void CopyDenseTable();
void CopyDenseVars();
private:
int mpi_rank_;
int worker_num_;
int xpu_begin_op_index_;
int xpu_end_op_index_;
ProgramDesc program_;
HeterObjectPool<HeterTask> object_pool_;
HeterList<int, std::shared_ptr<HeterTask>> run_queue_;
HeterList<int, std::shared_ptr<HeterTask>> wait_queue_;
bool need_dump_param_;
std::vector<std::string> dump_param_;
bool need_to_push_dense_;
bool need_dump_field_;
bool dump_slot_;
bool need_to_push_sparse_;
std::vector<std::string> dump_fields_;
ChannelWriter<std::string> writer_;
DownpourWorkerParameter param_;
float scale_datanorm_;
// just save the value in param_ for easy access
std::map<uint64_t, std::string> label_var_name_;
std::map<uint64_t, std::vector<std::string>> sparse_key_names_;
std::map<uint64_t, std::vector<std::string>> sparse_value_names_;
std::map<uint64_t, std::vector<std::string>> sparse_grad_names_;
std::map<uint64_t, std::vector<std::string>> dense_value_names_;
std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
platform::Place root_place_;
// actually pushed feasign of each table
std::map<uint64_t, std::vector<uint64_t>> sparse_push_keys_;
// skipped ops
std::vector<std::string> skip_ops_;
std::vector<::std::future<int32_t>> push_sparse_status_;
std::vector<::std::future<int32_t>> push_dense_status_;
// adjust ins weight
AdjustInsWeightConfig adjust_ins_weight_config_;
std::vector<float> nid_show_;
// check nan and inf during training
std::vector<std::string> check_nan_var_names_;
// copy table
CopyTableConfig copy_table_config_;
std::map<uint64_t, uint64_t> table_dependency_;
std::vector<std::pair<uint64_t, uint64_t>> copy_sparse_tables_;
std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
};
#endif
#if defined(PADDLE_WITH_NCCL)
class SectionWorker : public DeviceWorker {
 public:
...
@@ -62,6 +62,9 @@ std::shared_ptr<DeviceWorker> DeviceWorkerFactory::CreateDeviceWorker(
REGISTER_DEVICE_WORKER_CLASS(HogwildWorker);
REGISTER_DEVICE_WORKER_CLASS(DownpourWorker);
REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
+#ifdef PADDLE_WITH_PSLIB
+REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
+#endif
#if defined(PADDLE_WITH_NCCL)
REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
#endif
...
@@ -35,7 +35,7 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
  dump_file_num_ = trainer_desc.dump_file_num();
  const std::vector<paddle::framework::DataFeed *> readers =
      dataset->GetReaders();
+  RegisterHeterCallback();
  thread_num_ = readers.size();
  workers_.resize(thread_num_);
  for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size();
@@ -55,6 +55,7 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
    workers_[i]->SetDumpParamVector(dump_param_);
    workers_[i]->InitRandomDumpConfig(trainer_desc);
    workers_[i]->Initialize(trainer_desc);
+    workers_[i]->SetWorkerNum(thread_num_);
  }
  VLOG(3) << "going to initialize pull dense worker";
@@ -64,6 +65,13 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
  SetDebug(trainer_desc.debug());
}
+void DistMultiTrainer::RegisterHeterCallback() {
+  auto fleet_ptr = FleetWrapper::GetInstance();
+  fleet_ptr->RegisterHeterCallback([this](int worker, int taskid) {
+    // workers_[worker]->Schedule(taskid);
+  });
+}
void DistMultiTrainer::InitDumpEnv() {
  queue_ = paddle::framework::MakeChannel<std::string>();
  for (int i = 0; i < thread_num_; ++i) {
@@ -90,6 +98,9 @@ void DistMultiTrainer::InitTrainerEnv(const ProgramDesc &main_program,
    workers_[i]->SetRootScope(root_scope_);
    workers_[i]->CreateDeviceResource(main_program);  // Program
    workers_[i]->BindingDataFeedMemory();
+#ifdef PADDLE_WITH_PSLIB
+    workers_[i]->CacheProgram(main_program);
+#endif
  }
  // Scope* -> thread id, it will be used in push_dense op
  for (int i = 0; i < thread_num_; ++i) {
@@ -104,6 +115,11 @@ void DistMultiTrainer::InitOtherEnv(const ProgramDesc &main_program) {
  }
  pull_dense_worker_->SetRootScope(root_scope_);
  pull_dense_worker_->Start();
+#ifdef PADDLE_WITH_PSLIB
+  for (int i = 0; i < thread_num_; ++i) {
+    workers_[i]->GetXpuOpIndex();
+  }
+#endif
  VLOG(3) << "init other env done.";
}
...
@@ -55,9 +55,8 @@ message LarsConfig {
}

message LambConfig {
-  optional float beta1 = 1 [ default = 0.001 ];
-  optional float beta2 = 2 [ default = 0.999 ];
-  optional float epsilon = 3 [ default = 0.000001 ];
+  optional float lamb_weight_decay = 1 [ default = 0.01 ];
+  repeated string exclude_from_weight_decay = 2;
}

message BuildStrategy {
@@ -80,7 +79,7 @@ message ExecutionStrategy {
}

message AsyncConfig {
-  optional int32 k_steps = 1 [ default = 1 ];
+  optional int32 k_steps = 1 [ default = -1 ];
  optional int32 max_merge_var_num = 2 [ default = 1 ];
  optional int32 send_queue_size = 3 [ default = 16 ];
  optional bool independent_recv_thread = 4 [ default = false ];
@@ -114,7 +113,9 @@ message DistributedStrategy {
  optional bool fuse_all_reduce_ops = 18 [ default = true ];
  optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ];
  optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ];
-  // optional bool enable_backward_optimizer_op_deps = 19 [ default = true ];
+  optional bool cudnn_exhaustive_search = 21 [ default = true ];
+  optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
+  optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
  optional RecomputeConfig recompute_configs = 101;
  optional AMPConfig amp_configs = 102;
...
@@ -70,6 +70,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
    return ctx;
  }
+  inline ::DLContext operator()(const platform::XPUPlace &place) const {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("platform::XPUPlace is not supported"));
+  }
  inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA
    ::DLContext ctx;
...
@@ -379,7 +379,7 @@ void DownpourWorker::CopyDenseTable() {
    pull_dense_status.resize(0);
    fleet_ptr_->PullDenseVarsAsync(*root_scope_, dest_table,
                                   dense_value_names_[dest_table],
-                                   &pull_dense_status);
+                                   &pull_dense_status, true);
    for (auto& t : pull_dense_status) {
      t.wait();
      auto status = t.get();
...
@@ -444,8 +444,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
  int64_t max_memory_size = GetEagerDeletionThreshold();
  std::unique_ptr<GarbageCollector> gc;
  if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
-#ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(place_)) {
+#ifdef PADDLE_WITH_CUDA
      if (IsFastEagerDeletionModeEnabled()) {
        gc.reset(new UnsafeFastGPUGarbageCollector(
            BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
@@ -453,13 +453,22 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
      } else {
        gc.reset(new DefaultStreamGarbageCollector(
            BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
      }
-    } else if (platform::is_cpu_place(place_)) {
+#else
+      PADDLE_THROW(
+          platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle"));
#endif
+    } else if (platform::is_cpu_place(place_)) {
      gc.reset(new CPUGarbageCollector(
          BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size));
-#ifdef PADDLE_WITH_CUDA
-    }
+    } else if (platform::is_xpu_place(place_)) {
+#ifdef PADDLE_WITH_XPU
+      gc.reset(new XPUGarbageCollector(
+          BOOST_GET_CONST(platform::XPUPlace, place_), max_memory_size));
+#else
+      PADDLE_THROW(
+          platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
+    }
  }
  for (int64_t i = start_op_index; i < end_op_index; ++i) {
...
@@ -19,4 +19,6 @@ else()
  cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope)
endif(WITH_GLOO)
+cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context heter_service_proto)
cc_test(test_fleet SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
@@ -154,6 +154,219 @@ void FleetWrapper::CreateClient2ClientConnection() {
#endif
}
#ifdef PADDLE_WITH_PSLIB
void FleetWrapper::HeterPullSparseVars(
int workerid, std::shared_ptr<HeterTask> task, const uint64_t table_id,
const std::vector<std::string>& var_names, int fea_value_dim,
const std::vector<std::string>& var_emb_names) {
std::vector<::std::future<int32_t>> pull_sparse_status;
pull_sparse_status.resize(0);
auto& scope = *(task->scope_);
auto& fea_keys = (task->features_)[table_id];
auto& fea_values = (task->feature_values_)[table_id];
fea_keys.clear();
for (size_t var_index = 0; var_index < var_names.size(); ++var_index) {
const std::string& name = var_names[var_index];
Variable* var = scope.FindVar(name);
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
CHECK(tensor != nullptr) << "tensor of var " << name << " is null";
int64_t* ids = tensor->data<int64_t>();
size_t len = tensor->numel();
// skip slots which do not have embedding
const std::string& emb_name = var_emb_names[var_index];
Variable* emb_var = scope.FindVar(emb_name);
if (emb_var == nullptr) {
continue;
}
for (auto i = 0u; i < len; ++i) {
if (ids[i] == 0u) {
continue;
}
fea_keys.push_back(static_cast<uint64_t>(ids[i]));
}
}
fea_values.resize(fea_keys.size() + 1);
for (auto& t : fea_values) {
t.resize(fea_value_dim);
}
std::vector<float*> pull_result_ptr;
for (auto& t : fea_values) {
pull_result_ptr.push_back(t.data());
}
auto status = pslib_ptr_->_worker_ptr->heter_pull_sparse(
workerid, pull_result_ptr.data(), table_id, fea_keys.data(),
fea_keys.size(), task->taskid_);
pull_sparse_status.push_back(std::move(status));
for (auto& t : pull_sparse_status) {
t.wait();
auto status = t.get();
if (status != 0) {
LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]";
sleep(sleep_seconds_before_fail_exit_);
exit(-1);
}
}
}
void FleetWrapper::HeterPushSparseVars(
std::shared_ptr<HeterTask> task, const uint64_t table_id,
const std::vector<std::string>& sparse_key_names,
const std::vector<std::string>& sparse_grad_names, const int emb_dim,
std::vector<::std::future<int32_t>>* push_sparse_status, const bool use_cvm,
const bool dump_slot, const bool no_cvm) {
auto& scope = *(task->scope_);
int batch_size = task->cur_batch_;
int offset = 2;
int slot_offset = 0;
int grad_dim = emb_dim;
int show_index = 0;
int click_index = 1;
auto& fea_keys = (task->features_)[table_id];
auto& fea_labels = (task->feature_labels_)[table_id];
auto& push_values = (task->feature_grads_)[table_id];
auto& sparse_push_keys = (task->sparse_push_keys_)[table_id];
if (use_cvm) {
offset = 0;
grad_dim = emb_dim - 2;
}
if (no_cvm) {
offset = 0;
grad_dim = emb_dim;
}
if (dump_slot) {
slot_offset = 1;
show_index = 1;
click_index = 2;
}
CHECK_GE(grad_dim, 0);
sparse_push_keys.clear();
sparse_push_keys.reserve(fea_keys.size() + 1);
push_values.resize(fea_keys.size() + 1);
for (auto& t : push_values) {
t.resize(emb_dim + offset + slot_offset);
}
uint64_t fea_idx = 0u;
for (size_t i = 0;
i < sparse_key_names.size() && i < sparse_grad_names.size(); ++i) {
Variable* var = scope.FindVar(sparse_key_names[i]);
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
if (tensor == nullptr) {
LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null";
exit(-1);
}
size_t len = tensor->numel();
int64_t* ids = tensor->data<int64_t>();
int slot = 0;
if (dump_slot) {
slot = boost::lexical_cast<int>(sparse_key_names[i]);
}
Variable* g_var = scope.FindVar(sparse_grad_names[i]);
if (g_var == nullptr) {
continue;
}
LoDTensor* g_tensor = g_var->GetMutable<LoDTensor>();
if (g_tensor == nullptr) {
LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null";
exit(-1);
}
float* g = g_tensor->data<float>();
if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) {
int dim = emb_dim + offset;
Eigen::Map<
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
g_mat(g, g_tensor->numel() / dim, dim);
g_mat.rightCols(grad_dim) *= batch_size;
}
for (auto id_idx = 0u; id_idx < len; ++id_idx) {
if (ids[id_idx] == 0) {
g += emb_dim;
continue;
}
sparse_push_keys.push_back(ids[id_idx]);
CHECK(fea_idx < push_values.size());
if (use_cvm || no_cvm) {
memcpy(push_values[fea_idx].data() + offset + slot_offset, g,
sizeof(float) * emb_dim);
} else {
CHECK(fea_idx < fea_labels.size());
memcpy(push_values[fea_idx].data() + offset + slot_offset, g,
sizeof(float) * emb_dim);
push_values[fea_idx][show_index] = 1.0f;
push_values[fea_idx][click_index] =
static_cast<float>(fea_labels[fea_idx]);
}
if (dump_slot) {
push_values[fea_idx][0] = static_cast<float>(slot);
}
g += emb_dim;
fea_idx++;
}
}
// slots whose embedding has been stop gradient or
// not involved in forward-backward
uint64_t no_grad_fea_num = 0u;
for (size_t i = sparse_grad_names.size(); i < sparse_key_names.size(); ++i) {
Variable* var = scope.FindVar(sparse_key_names[i]);
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
if (tensor == nullptr) {
LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null";
exit(-1);
}
size_t len = tensor->numel();
int64_t* ids = tensor->data<int64_t>();
for (auto id_idx = 0u; id_idx < len; ++id_idx) {
if (ids[id_idx] == 0) {
continue;
}
++no_grad_fea_num;
}
}
CHECK(fea_idx + no_grad_fea_num == fea_keys.size())
<< "fea_idx: " << fea_idx << " no_grad_fea_num: " << no_grad_fea_num
<< " features size: " << fea_keys.size();
CHECK(fea_idx == sparse_push_keys.size());
if (fea_idx == 0) {
return;
}
std::vector<float*> push_g_vec;
for (auto i = 0u; i < sparse_push_keys.size(); ++i) {
push_g_vec.push_back(push_values[i].data());
}
auto status = pslib_ptr_->_worker_ptr->push_sparse(
table_id, sparse_push_keys.data(), (const float**)push_g_vec.data(),
sparse_push_keys.size());
push_sparse_status->push_back(std::move(status));
}
#endif
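For readers tracing the offset arithmetic in HeterPushSparseVars above, the following is a minimal illustrative sketch, not part of this commit, of how one gradient row is laid out when dump_slot is on and both use_cvm and no_cvm are off; MakePushRow is a hypothetical helper name.
// Illustrative sketch, assuming dump_slot == true, use_cvm == no_cvm == false,
// so offset = 2, slot_offset = 1, show_index = 1, click_index = 2 as above.
// Row layout: [ slot | show | click | g[0] ... g[emb_dim-1] ]
#include <cstring>
#include <vector>
std::vector<float> MakePushRow(int slot, float show, float click,
                               const float* g, int emb_dim) {
  std::vector<float> row(emb_dim + 2 + 1);  // emb_dim + offset + slot_offset
  row[0] = static_cast<float>(slot);        // dump_slot: slot id at index 0
  row[1] = show;                            // show_index == 1
  row[2] = click;                           // click_index == 2
  std::memcpy(row.data() + 3, g, sizeof(float) * emb_dim);  // offset + slot_offset
  return row;
}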
int FleetWrapper::RegisterHeterCallback(HeterCallBackFunc handler) {
#ifdef PADDLE_WITH_PSLIB
VLOG(3) << "calling FleetWrapper::RegisterHeterCallback";
VLOG(3) << "pslib_ptr_=" << pslib_ptr_;
VLOG(3) << "_worker_ptr=" << pslib_ptr_->_worker_ptr;
return pslib_ptr_->_worker_ptr->registe_heter_callback(handler);
#else
VLOG(0) << "FleetWrapper::RegisterHeterCallback"
<< " does nothing when no pslib";
#endif
return 0;
}
void FleetWrapper::PullSparseToLocal(const uint64_t table_id,
int fea_value_dim) {
#ifdef PADDLE_WITH_PSLIB
...@@ -421,13 +634,17 @@ void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim,
void FleetWrapper::PullDenseVarsAsync(
const Scope& scope, const uint64_t tid,
const std::vector<std::string>& var_names,
std::vector<::std::future<int32_t>>* pull_dense_status, bool in_cpu) {
#ifdef PADDLE_WITH_PSLIB
auto& regions = _regions[tid];
regions.clear();
regions.resize(var_names.size());
for (auto i = 0u; i < var_names.size(); ++i) {
std::string varname = var_names[i];
if (!in_cpu) {
varname = var_names[i] + "pin";
}
Variable* var = scope.FindVar(varname);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* w = tensor->data<float>();
paddle::ps::Region reg(w, tensor->numel());
...@@ -485,6 +702,57 @@ void FleetWrapper::PushDenseVarsSync(
Scope* scope, const uint64_t table_id,
const std::vector<std::string>& var_names) {}
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
void FleetWrapper::PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<::std::future<int32_t>>* push_sparse_status,
float scale_datanorm, int batch_size, const paddle::platform::Place& place,
cudaStream_t stream, cudaEvent_t event) {
std::vector<paddle::ps::Region> regions;
for (auto& t : var_names) {
Variable* var = scope.FindVar(t);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
int count = tensor->numel();
float* g_data = tensor->data<float>();
Variable* pin_var = scope.FindVar(t + "pin");
LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
float* pin_g = pin_tensor->mutable_data<float>(tensor->dims(),
platform::CUDAPinnedPlace());
memory::Copy(platform::CUDAPinnedPlace(), pin_g,
BOOST_GET_CONST(platform::CUDAPlace, place), g_data,
sizeof(float) * count, stream);
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
cudaEventSynchronize(event);
float* g = pin_g;
if (scale_datanorm >= 0) {
if (t.find(".batch_size@GRAD") != std::string::npos ||
t.find(".batch_sum@GRAD") != std::string::npos) {
Eigen::Map<Eigen::MatrixXf> mat(g, 1, count);
float scale = 1.0 / batch_size;
mat *= scale;
} else if (t.find(".batch_square_sum@GRAD") != std::string::npos) {
VLOG(3) << "epsilon: " << scale_datanorm;
for (int i = 0; i < count; ++i) {
g[i] = (g[i] - batch_size * scale_datanorm) / batch_size +
batch_size * scale_datanorm;
}
}
}
paddle::ps::Region reg(g, count);
regions.emplace_back(std::move(reg));
}
auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(),
regions.size(), table_id);
if (push_sparse_status) {
push_sparse_status->push_back(std::move(status));
}
}
#endif
void FleetWrapper::PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
...@@ -1085,8 +1353,8 @@ void FleetWrapper::ShrinkDenseTable(int table_id, Scope* scope,
push_status.wait();
auto status = push_status.get();
if (status != 0) {
// PADDLE_THROW(platform::errors::Fatal(
//     "push shrink dense param failed, status is [%d].", status));
sleep(sleep_seconds_before_fail_exit_);
exit(-1);
}
......
...@@ -28,6 +28,7 @@ limitations under the License. */
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/heter_service.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
...@@ -80,6 +81,24 @@ class FleetWrapper {
pull_local_thread_num_ = thread_num;
}
#ifdef PADDLE_WITH_PSLIB
void HeterPullSparseVars(int workerid, std::shared_ptr<HeterTask> task,
const uint64_t table_id,
const std::vector<std::string>& var_names,
int fea_dim,
const std::vector<std::string>& var_emb_names);
void HeterPushSparseVars(
std::shared_ptr<HeterTask> task, const uint64_t table_id,
const std::vector<std::string>& sparse_key_names,
const std::vector<std::string>& sparse_grad_names, const int emb_dim,
std::vector<::std::future<int32_t>>* push_sparse_status,
const bool use_cvm, const bool dump_slot, const bool no_cvm);
#endif
typedef std::function<void(int, int)> HeterCallBackFunc;
int RegisterHeterCallback(HeterCallBackFunc handler);
// Pull sparse variables from server in sync mode
// Param<in>: scope, table_id, var_names, fea_keys, fea_dim, var_emb_names
// Param<out>: fea_values
...@@ -118,15 +137,24 @@ class FleetWrapper {
void PullDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<::std::future<int32_t>>* pull_dense_status, bool in_cpu);
// push dense parameters(not gradients) to server in sync mode
void PushDenseParamSync(const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names);
// Push dense variables to server in async mode
// Param<in>: scope, table_id, var_names, scale_datanorm, batch_size
// Param<out>: push_sparse_status
#ifdef PADDLE_WITH_CUDA
void PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<::std::future<int32_t>>* push_sparse_status,
float scale_datanorm, int batch_size,
const paddle::platform::Place& place, cudaStream_t stream,
cudaEvent_t event);
#endif
void PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
......
...@@ -54,9 +54,8 @@ void HdfsStore::set(const std::string& key, const std::vector<char>& data) {
paddle::framework::fs_remove(tmp);
if (i == retry_times_) {
VLOG(0) << "fs_open_write failed, retry times reaches limit";
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"fs_open_write failed, retry times reaches %d limit.",
retry_times_));
}
} else {
...@@ -143,7 +142,7 @@ void HdfsStore::wait(const std::vector<std::string>& keys,
break;
}
}
PADDLE_THROW(paddle::platform::errors::ExecutionTimeout(
"TIMEOUT self_rank = %d pair_rank = %d", self_rank_,
last_check_rank));
}
......
...@@ -105,6 +105,11 @@ enum GlooStoreType { HDFS, HTTP };
class GlooWrapper {
public:
static std::shared_ptr<GlooWrapper> GetInstance() {
static auto s_instance = std::make_shared<GlooWrapper>();
return s_instance;
}
GlooWrapper() {}
virtual ~GlooWrapper() {}
...@@ -153,6 +158,11 @@ class GlooWrapper {
#endif
}
bool IsInitialized() { return is_initialized_; }
#ifdef PADDLE_WITH_GLOO
std::shared_ptr<gloo::Context> GetContext() { return context_; }
#endif
template <typename T>
std::vector<T> AllReduce(std::vector<T>& sendbuf,        // NOLINT
const std::string& mode = "sum") {  // NOLINT
......
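The GetInstance and IsInitialized accessors added above make GlooWrapper usable as a process-wide singleton. A hedged usage sketch follows; SumAcrossRanks is a hypothetical caller, and the wrapper is assumed to have been initialized elsewhere, since the initialization APIs are outside this hunk.
#include <cstdint>
#include <vector>
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
// Hypothetical caller: element-wise sum of a local vector across all ranks.
void SumAcrossRanks(std::vector<int64_t>* local) {
  auto gloo = paddle::framework::GlooWrapper::GetInstance();
  if (!gloo->IsInitialized()) {
    return;  // nothing to reduce against; initialization happens elsewhere
  }
  *local = gloo->AllReduce(*local, "sum");  // returns the reduced vector
}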
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/fleet/heter_wrapper.h"
#include <algorithm>
#include <utility>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/timer.h"
#ifdef PADDLE_WITH_PSLIB
namespace paddle {
namespace framework {
std::shared_ptr<HeterWrapper> HeterWrapper::s_instance_ = NULL;
bool HeterWrapper::is_initialized_ = false;
void HeterWrapper::CreateClient2XpuConnection() {
brpc::ChannelOptions options;
options.protocol = "baidu_std";
options.connection_type = "single";
options.timeout_ms = 2000000;
xpu_channels_.resize(xpu_list_.size());
for (size_t i = 0; i < xpu_list_.size(); ++i) {
VLOG(3) << "channel init: " << xpu_list_[i];
xpu_channels_[i].reset(new brpc::Channel());
if (xpu_channels_[i]->Init(xpu_list_[i].c_str(), "", &options) != 0) {
VLOG(0) << "server channel init fail";
}
}
}
void HeterWrapper::RegisterServiceHandler(int cmd, HeterServiceHandler func) {
service_.RegisterServiceHandler(cmd, func);
}
void HeterWrapper::SetXpuList(const std::vector<std::string>& xpu_list) {
#ifdef PADDLE_WITH_PSLIB
VLOG(3) << "Going to set xpu list";
for (auto& x : xpu_list) {
xpu_list_.push_back(x);
VLOG(3) << "set xpu list: " << x << " size: " << xpu_list_.size();
}
#endif
}
void HeterWrapper::StartXpuService(const std::string& ip, uint32_t port) {
std::string ip_port = ip + ":" + std::to_string(port);
VLOG(3) << "xpu server starts at " << ip_port;
server_.AddService(&service_, brpc::SERVER_DOESNT_OWN_SERVICE);
brpc::ServerOptions options;
if (server_.Start(ip_port.c_str(), &options) != 0) {
VLOG(0) << "xpu server start fail";
}
}
// void HeterWrapper::SerializeToReq(const std::string& varname,
// Scope* scope, HeterRequest& request) {
// auto* req_var = request.mutable_vars();
void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
VariableMessage* req_var) {
Variable* var = scope->FindVar(varname);
if (var == nullptr) {
return;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
req_var->set_varname(varname);
req_var->set_type(LOD_TENSOR);
req_var->set_data_type(static_cast<VariableMessage::Type>(tensor->type()));
for (auto& dim : framework::vectorize(tensor->dims())) {
req_var->add_dims(dim);
}
const framework::LoD lod = tensor->lod();
if (lod.size() > 0) {
req_var->set_lod_level(lod.size());
for (auto& each : lod) {
VariableMessage::LodData* lod_inner = req_var->add_lod();
for (auto& d : each) {
lod_inner->add_lod_data(d);
}
}
}
auto* req_data = req_var->mutable_data();
req_data->clear();
req_data->resize(tensor->numel() * SizeOfType(tensor->type()));
char* data_ptr = const_cast<char*>(req_data->data());
if (platform::is_cpu_place(tensor->place())) {
memcpy(data_ptr, tensor->data<void>(),
tensor->numel() * SizeOfType(tensor->type()));
}
#ifdef PADDLE_WITH_CUDA
else {
memory::Copy(platform::CPUPlace(), data_ptr,
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
tensor->data<void>(),
tensor->numel() * SizeOfType(tensor->type()), nullptr);
}
#endif
}
// void HeterWrapper::DeSerializeToTensor(Scope* scope,
// const HeterRequest* request) {
#ifdef PADDLE_WITH_CUDA
void HeterWrapper::DeSerializeToTensor(Scope* scope,
const VariableMessage& req_var,
platform::Place place,
cudaStream_t stream) {
#else
void HeterWrapper::DeSerializeToTensor(Scope* scope,
const VariableMessage& req_var,
platform::Place place) {
#endif
// const VariableMessage& req_var = request->vars();
auto* var = scope->FindVar(req_var.varname());
auto* tensor = var->GetMutable<LoDTensor>();
std::vector<int> vec_dim;
for (auto& x : req_var.dims()) {
vec_dim.push_back(x);
}
tensor->Resize(make_ddim(vec_dim));
LoD lod;
for (int i = 0; i < req_var.lod_level(); ++i) {
framework::Vector<size_t> v;
for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) {
v.push_back(req_var.lod(i).lod_data(j));
}
lod.push_back(v);
}
tensor->set_lod(lod);
void* tensor_data =
tensor->mutable_data(place, ToVarType(req_var.data_type()));
#ifdef PADDLE_WITH_CUDA
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
platform::CPUPlace(), req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()), stream);
#else
memcpy(tensor_data, req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()));
#endif
}
framework::proto::VarType::Type HeterWrapper::ToVarType(
VariableMessage::Type type) {
switch (type) {
case VariableMessage::FP32:
return framework::proto::VarType::FP32; // NOLINT
case VariableMessage::FP64:
return framework::proto::VarType::FP64; // NOLINT
case VariableMessage::INT32:
return framework::proto::VarType::INT32; // NOLINT
case VariableMessage::INT64:
return framework::proto::VarType::INT64; // NOLINT
case VariableMessage::BOOL:
return framework::proto::VarType::BOOL; // NOLINT
default:
VLOG(0) << "Not support type " << type;
}
}
void HeterWrapper::StopXpuService(int num) {
HeterRequest request;
HeterResponse response;
brpc::Controller cntl;
request.set_cmd(2);
// for (size_t i = 0; i < xpu_channels_.size(); ++i) {
HeterService_Stub stub(xpu_channels_[num].get());
stub.service(&cntl, &request, &response, NULL);
if (cntl.Failed()) {
VLOG(0) << "call stop xpu service fail: " << cntl.ErrorText();
} else {
VLOG(3) << "call stop xpu service success";
}
// }
}
void HeterWrapper::EndPass(Scope* scope, int num) {
HeterRequest request;
HeterResponse response;
brpc::Controller cntl;
request.set_cmd(1);
// for (size_t i = 0; i < xpu_channels_.size(); ++i) {
HeterService_Stub stub(xpu_channels_[num].get());
stub.service(&cntl, &request, &response, NULL);
if (cntl.Failed()) {
VLOG(0) << "call end pass fail: " << cntl.ErrorText();
} else {
VLOG(3) << "call end pass success";
for (int j = 0; j < response.vars_size(); ++j) {
DeSerializeToTensor(scope, response.vars(j), platform::CPUPlace());
}
}
// }
}
void HeterWrapper::CallRemoteXpu(std::shared_ptr<HeterTask> task,
HeterCpuWorker* worker, int mpi_rank,
std::vector<std::string>& send_vars) {
HeterRequest request;
request.set_cmd(0);
request.set_cur_batch(task->cur_batch_);
OnHeterRpcDone* done = new OnHeterRpcDone([this, task, worker](void* done) {
auto* closure = (OnHeterRpcDone*)done;
if (closure->cntl.Failed()) {
VLOG(0) << "call xpu fail: " << closure->cntl.ErrorText();
} else {
VLOG(3) << "call xpu success";
}
// DeSerializeToTensor(task->scope_,
// closure->response.vars(), platform::CPUPlace());
for (int i = 0; i < closure->response.vars_size(); ++i) {
DeSerializeToTensor(task->scope_, closure->response.vars(i),
platform::CPUPlace());
}
worker->Schedule(task->taskid_);
});
// std::vector<std::string> varnames = {"click", "12345"};
// //varnames.push_back(send_var);
// //if (send_var == "_generated_var_412") {
// varnames.push_back("filter_by_instag_0.tmp_0");
// varnames.push_back("filter_by_instag_2.tmp_0");
// varnames.push_back("filter_by_instag_0.tmp_1");
// varnames.push_back("concat_1.tmp_0");
// }
for (auto& varname : send_vars) {
auto* req_var = request.add_vars();
SerializeToReq(varname, task->scope_, req_var);
}
int num = mpi_rank % xpu_channels_.size();
HeterService_Stub stub(xpu_channels_[num].get());
// stub.service(&cntl, &request, &response,
// brpc::NewCallback(&HeterWrapper::RpcCallBack,
// response, cntl, worker, task));
stub.service(&done->cntl, &request, &done->response, done);
}
void HeterWrapper::CallRemoteXpuSync(std::shared_ptr<HeterTask> task,
HeterCpuWorker* worker, int mpi_rank,
std::vector<std::string>& send_vars) {
HeterRequest request;
HeterResponse response;
brpc::Controller cntl;
request.set_cmd(0);
request.set_cur_batch(task->cur_batch_);
// std::vector<std::string> varnames = {"concat_1.tmp_0", "click", "12345"};
for (auto& varname : send_vars) {
auto* req_var = request.add_vars();
SerializeToReq(varname, task->scope_, req_var);
}
HeterService_Stub stub(xpu_channels_[0].get());
stub.service(&cntl, &request, &response, NULL);
if (cntl.Failed()) {
VLOG(0) << "call xpu fail: " << cntl.ErrorText();
} else {
VLOG(3) << "call xpu success";
for (int i = 0; i < response.vars_size(); ++i) {
DeSerializeToTensor(task->scope_, response.vars(i), platform::CPUPlace());
}
}
}
} // end namespace framework
} // end namespace paddle
#endif
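As a reading aid, here is an illustrative round trip through the two serialization helpers defined above; it is not shipped code, the variable name "emb" is a placeholder, and the nullptr stream simply takes the declared default.
#include "paddle/fluid/framework/fleet/heter_wrapper.h"
// Hypothetical example: copy LoDTensor "emb" from one scope to another via
// the VariableMessage proto, the same path the RPC handlers above use.
void RoundTrip(paddle::framework::Scope* src, paddle::framework::Scope* dst) {
  auto heter = paddle::framework::HeterWrapper::GetInstance();
  paddle::framework::VariableMessage msg;
  heter->SerializeToReq("emb", src, &msg);  // dims, lod and raw bytes -> proto
#ifdef PADDLE_WITH_CUDA
  heter->DeSerializeToTensor(dst, msg, paddle::platform::CUDAPlace(0),
                             nullptr);  // default stream, copy back to device
#else
  heter->DeSerializeToTensor(dst, msg, paddle::platform::CPUPlace());
#endif
}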
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <ctime>
#include <map>
#include <memory>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
#ifdef PADDLE_WITH_PSLIB
#include "paddle/fluid/framework/heter_service.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle {
namespace framework {
class HeterCpuWorker;
typedef std::function<void(void*)> HeterRpcCallbackFunc;
class OnHeterRpcDone : public google::protobuf::Closure {
public:
OnHeterRpcDone(HeterRpcCallbackFunc func) : handler_(func) {}
virtual ~OnHeterRpcDone() {}
void Run() {
std::unique_ptr<OnHeterRpcDone> self_guard(this);
handler_(this);
}
HeterRpcCallbackFunc handler_;
HeterResponse response;
brpc::Controller cntl;
};
class HeterWrapper {
public:
virtual ~HeterWrapper() {
server_.Stop(1000);
server_.Join();
}
HeterWrapper() {}
static void HeterRpcCallBack(HeterResponse* response, brpc::Controller* cntl,
HeterCpuWorker* worker,
std::shared_ptr<HeterTask> task);
void CreateClient2XpuConnection();
void RegisterServiceHandler(int cmd, HeterServiceHandler func);
void StartXpuService(const std::string& ip, uint32_t port);
void CallRemoteXpu(std::shared_ptr<HeterTask> task, HeterCpuWorker* worker,
int mpi_rank, std::vector<std::string>& send_vars);
void CallRemoteXpuSync(std::shared_ptr<HeterTask> task,
HeterCpuWorker* worker, int mpi_rank,
std::vector<std::string>& send_vars);
void StopXpuService(int num);
void EndPass(Scope* scope, int num);
void SerializeToReq(const std::string& varname, Scope* scope,
VariableMessage* req_var);
framework::proto::VarType::Type ToVarType(VariableMessage::Type type);
#ifdef PADDLE_WITH_CUDA
void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
platform::Place place,
cudaStream_t stream = nullptr);
#else
void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
platform::Place place);
#endif
// HeterWrapper singleton
static std::shared_ptr<HeterWrapper> GetInstance() {
if (NULL == s_instance_) {
s_instance_.reset(new paddle::framework::HeterWrapper());
}
return s_instance_;
}
std::vector<std::string>& GetXpuList() { return xpu_list_; }
void SetXpuList(const std::vector<std::string>& xpu_list);
private:
static std::shared_ptr<HeterWrapper> s_instance_;
protected:
std::vector<std::shared_ptr<brpc::Channel>> xpu_channels_;
brpc::Server server_;
HeterXpuService service_;
static bool is_initialized_;
DISABLE_COPY_AND_ASSIGN(HeterWrapper);
std::vector<std::string> xpu_list_;
};
} // end namespace framework
} // end namespace paddle
#endif
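For reference, a minimal wiring sketch for the HeterWrapper singleton declared above; it is illustrative only, and the endpoint strings are hypothetical placeholders.
// Illustrative only: CPU-trainer-side setup of the heterogeneous RPC client.
void WireUpHeterWorker() {
  auto heter = paddle::framework::HeterWrapper::GetInstance();
  heter->SetXpuList({"127.0.0.1:8500", "127.0.0.1:8501"});  // hypothetical endpoints
  heter->CreateClient2XpuConnection();  // one brpc channel per endpoint
  // The device-side process would instead call:
  //   heter->StartXpuService(ip, port);
}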
...@@ -50,6 +50,15 @@ void CPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
callback();
}
#ifdef PADDLE_WITH_XPU
XPUGarbageCollector::XPUGarbageCollector(const platform::XPUPlace &place,
size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void XPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
callback();
}
#endif
#ifdef PADDLE_WITH_CUDA
UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
const platform::CUDAPlace &place, size_t max_memory_size)
......
...@@ -59,6 +59,16 @@ class CPUGarbageCollector : public GarbageCollector {
void ClearCallback(const std::function<void()> &callback) override;
};
#ifdef PADDLE_WITH_XPU
class XPUGarbageCollector : public GarbageCollector {
public:
XPUGarbageCollector(const platform::XPUPlace &place, size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
#endif
#ifdef PADDLE_WITH_CUDA
class UnsafeFastGPUGarbageCollector : public GarbageCollector {
public:
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <deque>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/generator.h"
namespace paddle {
namespace framework {
std::shared_ptr<Generator> Generator::gen_instance_ = NULL;
GeneratorState* Generator::GetState() {
std::lock_guard<std::mutex> lock(this->mutex);
return this->state_.get();
}
void Generator::SetState(GeneratorState* state_in) {
std::lock_guard<std::mutex> lock(this->mutex);
*this->state_ = *state_in;
}
uint64_t Generator::GetCurrentSeed() {
std::lock_guard<std::mutex> lock(this->mutex);
return this->state_->current_seed;
}
uint64_t Generator::Seed() {
std::lock_guard<std::mutex> lock(this->mutex);
uint64_t seed;
std::random_device de;
seed = ((((uint64_t)de()) << 32) + de()) & 0x1FFFFFFFFFFFFF;
this->state_->current_seed = seed;
std::seed_seq seq({seed});
this->state_->cpu_engine.seed(seq);
return this->state_->current_seed;
}
void Generator::SetCurrentSeed(uint64_t seed) {
std::lock_guard<std::mutex> lock(this->mutex);
this->state_->current_seed = uint64_t(seed);
std::seed_seq seq({seed});
this->state_->cpu_engine.seed(seq);
}
std::mt19937_64& Generator::GetCPUEngine() {
std::lock_guard<std::mutex> lock(this->mutex);
return this->state_->cpu_engine;
}
void Generator::SetCPUEngine(std::mt19937_64 engine) {
std::lock_guard<std::mutex> lock(this->mutex);
this->state_->cpu_engine = std::mt19937_64(engine);
}
uint64_t Generator::Random64() {
std::lock_guard<std::mutex> lock(this->mutex);
return this->state_->cpu_engine();
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <atomic>
#include <deque>
#include <iostream> // temp for debug
#include <memory>
#include <mutex> // NOLINT
#include <random>
#include <typeinfo>
#include <utility>
namespace paddle {
namespace framework {
struct GeneratorState {
int64_t device = -1;
uint64_t current_seed = 34342423252;
std::mt19937_64 cpu_engine;
};
struct Generator {
Generator() {
GeneratorState default_gen_state_cpu;
default_gen_state_cpu.device = -1;
default_gen_state_cpu.current_seed = 34342423252;
std::seed_seq seq({34342423252});
default_gen_state_cpu.cpu_engine = std::mt19937_64(seq);
this->state_ = std::make_shared<GeneratorState>(default_gen_state_cpu);
}
explicit Generator(GeneratorState state_in)
: state_{std::make_shared<GeneratorState>(state_in)} {}
Generator(const Generator& other)
: Generator(other, std::lock_guard<std::mutex>(other.mutex)) {}
// get random state
GeneratorState* GetState();
// set random state
void SetState(GeneratorState* state_in);
// get current seed
uint64_t GetCurrentSeed();
// random a seed and get
uint64_t Seed();
// set seed
void SetCurrentSeed(uint64_t seed);
// get cpu engine
std::mt19937_64& GetCPUEngine();
// set cpu engine
void SetCPUEngine(std::mt19937_64 engine);
uint64_t Random64();
bool is_init_py = false;
// CPU Generator singleton
static std::shared_ptr<Generator> GetInstance() {
if (NULL == gen_instance_) {
gen_instance_.reset(new paddle::framework::Generator());
}
return gen_instance_;
}
static std::shared_ptr<Generator> GetInstanceX() {
if (NULL == gen_instance_) {
gen_instance_.reset(new paddle::framework::Generator());
}
gen_instance_->is_init_py = true;
return gen_instance_;
}
private:
static std::shared_ptr<Generator> gen_instance_;
std::shared_ptr<GeneratorState> state_;
mutable std::mutex mutex;
Generator(const Generator& other, const std::lock_guard<std::mutex>&)
: state_(std::make_shared<GeneratorState>(*(other.state_))) {}
};
} // namespace framework
} // namespace paddle
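A short usage sketch of the CPU generator singleton defined above; DrawOne is a hypothetical helper, and the seed value is arbitrary.
#include <cstdint>
// Illustrative only: seed the singleton, then draw a value. Both calls
// lock the generator's internal mutex, so this is safe across threads.
uint64_t DrawOne() {
  auto gen = paddle::framework::Generator::GetInstance();
  gen->SetCurrentSeed(42);  // reseeds the internal std::mt19937_64
  return gen->Random64();   // one draw from the reseeded engine
}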
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <fstream>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <thread> // NOLINT
#include <unordered_map> // NOLINT
#include <unordered_set> // NOLINT
#include <vector>
#include "paddle/fluid/framework/heter_service.pb.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_PSLIB
#include "brpc/channel.h"
#include "brpc/controller.h"
#include "brpc/server.h"
namespace paddle {
namespace framework {
typedef std::function<int(const HeterRequest*, HeterResponse*)>
HeterServiceHandler;
class DataFeed;
class HeterXpuService : public HeterService {
public:
HeterXpuService() {}
virtual ~HeterXpuService() {}
void service(::google::protobuf::RpcController* controller,
const HeterRequest* request, HeterResponse* response,
::google::protobuf::Closure* done) {
brpc::ClosureGuard done_guard(done);
int ret = 0;
int cmd = request->cmd();
auto itr = handler_map_.find(cmd);
if (itr == handler_map_.end()) {
} else {
ret = itr->second(request, response);
}
// response->set_err_code(0);
// response->set_err_msg("");
if (ret != 0) {
// response->set_err_code(-1);
// response->set_err_msg("xpu service error");
}
}
void RegisterServiceHandler(int cmd, HeterServiceHandler func) {
VLOG(0) << "register heter service";
handler_map_[cmd] = func;
}
private:
std::unordered_map<int, HeterServiceHandler> handler_map_;
};
enum HeterTaskState { PULL_SPARSE, OP_RUN, XPU, OP_RUN_END, PUSH_GRAD, DONE };
class HeterTask {
public:
void Update() {
if (state_ == PULL_SPARSE) {
state_ = OP_RUN;
} else if (state_ == OP_RUN) {
state_ = XPU;
// state_ = PUSH_GRAD;
// state_ = PUSH_GRAD;
} else if (state_ == XPU) {
state_ = OP_RUN_END;
} else if (state_ == OP_RUN_END) {
state_ = PUSH_GRAD;
} else if (state_ == PUSH_GRAD) {
state_ = DONE;
}
}
void Reset() {
total_time = 0;
read_time = 0;
pack_time = 0;
pull_sparse_local_time = 0;
op_all_time = 0;
xpu_op_time = 0;
xpu_wait_time = 0;
cpu_op_time = 0;
collect_label_time = 0;
fill_sparse_time = 0;
push_sparse_time = 0;
}
void Show() {
std::cout << "features size " << features_.size() << std::endl;
for (size_t i = 0; i < features_.size(); ++i) {
std::cout << "features[" << i << "] size " << features_[i].size()
<< std::endl;
}
}
void PackTask(Scope* scope, int taskid, DataFeed* reader, int cur_batch,
const ProgramDesc& program);
Scope* scope_{nullptr};
int taskid_;
int cur_batch_;
HeterTaskState state_;
// cache
std::map<uint64_t, std::vector<uint64_t>> features_;
std::map<uint64_t, std::vector<float>> feature_labels_;
std::map<uint64_t, std::vector<std::vector<float>>> feature_values_;
std::map<uint64_t, std::vector<std::vector<float>>> feature_grads_;
std::map<uint64_t, std::vector<uint64_t>> sparse_push_keys_;
double total_time{0};
double read_time{0};
double pack_time{0};
double pull_sparse_local_time{0};
double op_all_time{0};
double xpu_op_time{0};
double xpu_wait_time{0};
double cpu_op_time{0};
double collect_label_time{0};
double fill_sparse_time{0};
double push_sparse_time{0};
};
template <class T>
class HeterObjectPool {
public:
HeterObjectPool() {}
virtual ~HeterObjectPool(){};
std::shared_ptr<T> Get() {
std::lock_guard<std::mutex> lock(mutex_);
if (pool_.empty()) {
num_ += 1;
#ifdef PADDLE_WITH_CUDA
VLOG(0) << "pool construct size: " << num_;
#endif
return std::make_shared<T>();
} else {
auto ret = pool_.back();
pool_.pop_back();
return ret;
}
}
void Push(std::shared_ptr<T> data) {
std::lock_guard<std::mutex> lock(mutex_);
pool_.push_back(std::move(data));
}
int Size() {
std::lock_guard<std::mutex> lock(mutex_);
return pool_.size();
}
std::shared_ptr<T>& GetElement(int i) { return pool_[i]; }
private:
std::vector<std::shared_ptr<T>> pool_;
std::mutex mutex_;
int num_{0};
};
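An illustrative usage sketch of the pool above, not part of the patch: tasks are recycled so their scopes and feature buffers are reused instead of reallocated per batch; PoolRoundTrip is a hypothetical helper.
// Illustrative only: the Get/Reset/Push lifecycle of a pooled HeterTask.
inline void PoolRoundTrip(HeterObjectPool<HeterTask>& pool) {
  auto task = pool.Get();  // pops a recycled task, or constructs a new one
  task->Reset();           // clear per-batch timing counters before reuse
  pool.Push(task);         // hand it back for the next batch
}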
struct BthreadMutextGuard {
BthreadMutextGuard(bthread_mutex_t* rho) {
mutex_ = rho;
bthread_mutex_lock(mutex_);
}
~BthreadMutextGuard() { bthread_mutex_unlock(mutex_); }
bthread_mutex_t* mutex_;
};
template <class T>
class BtObjectPool {
public:
BtObjectPool() {
bthread_mutex_init(&mutex_, NULL);
bthread_cond_init(&cond_, NULL);
}
virtual ~BtObjectPool() {
bthread_cond_destroy(&cond_);
bthread_mutex_destroy(&mutex_);
};
std::shared_ptr<T> Get() {
BthreadMutextGuard guard(&mutex_);
while (pool_.empty()) {
bthread_cond_wait(&cond_, &mutex_);
}
auto ret = pool_.back();
pool_.pop_back();
return ret;
}
void Push(std::shared_ptr<T> data) {
BthreadMutextGuard guard(&mutex_);
pool_.push_back(std::move(data));
bthread_cond_signal(&cond_);
}
int Size() { return pool_.size(); }
std::shared_ptr<T>& GetElement(int i) { return pool_[i]; }
private:
std::vector<std::shared_ptr<T>> pool_;
bthread_mutex_t mutex_;
bthread_cond_t cond_;
int num_{0};
};
template <class K, class T>
struct HeterNode {
K key;
T value;
HeterNode* prev;
HeterNode* next;
};
template <class K, class T>
class HeterList {
public:
HeterList() : head_(new HeterNode<K, T>), tail_(new HeterNode<K, T>) {
head_->prev = NULL;
head_->next = tail_;
tail_->prev = head_;
tail_->next = NULL;
size = 0;
cap_ = 1e9;
}
~HeterList() {
delete head_;
delete tail_;
}
void SetCap(int num) { cap_ = num; }
bool TryPut(K& key, T& value) {
std::unique_lock<std::mutex> lock(mutex_);
cond_.wait(lock, [this] { return size < cap_; });
if (task_map_.find(key) != task_map_.end()) {
// std::cout << "try put key=" << key << " false" << std::endl;
task_map_.erase(key);
return false;
} else {
HeterNode<K, T>* node = new HeterNode<K, T>;
node->key = key;
node->value = value;
map_[node->key] = node;
attach(node);
// std::cout << "try put key=" << key << " true" << std::endl;
return true;
}
}
bool Put(K& key, T& value) {
std::unique_lock<std::mutex> lock(mutex_);
cond_.wait(lock, [this] { return size < cap_; });
HeterNode<K, T>* node = new HeterNode<K, T>;
// std::cout << "put key=" << key << " true" << std::endl;
node->key = key;
node->value = value;
map_[node->key] = node;
attach(node);
return true;
}
T TryGet(const K& key) {
std::lock_guard<std::mutex> lock(mutex_);
auto iter = map_.find(key);
if (iter != map_.end()) {
// std::cout << "try get key=" << key << " true" << std::endl;
HeterNode<K, T>* node = iter->second;
detach(node);
cond_.notify_one();
T ret = std::move(node->value);
map_.erase(key);
delete node;
return ret;
}
task_map_.insert(key);
// std::cout << "try get key=" << key << " false" << std::endl;
return nullptr;
}
T Get(const K& key) {
std::lock_guard<std::mutex> lock(mutex_);
auto iter = map_.find(key);
if (iter != map_.end()) {
// std::cout << "get key=" << key << " true" << std::endl;
HeterNode<K, T>* node = iter->second;
detach(node);
cond_.notify_one();
T ret = std::move(node->value);
map_.erase(key);
delete node;
return ret;
}
// std::cout << "get key=" << key << " false" << std::endl;
return nullptr;
}
T Get() {
std::lock_guard<std::mutex> lock(mutex_);
HeterNode<K, T>* node = head_->next;
if (node == tail_) {
// std::cout << "get2 false" << std::endl;
return nullptr;
} else {
detach(node);
cond_.notify_one();
T ret = std::move(node->value);
map_.erase(node->key);
// std::cout << "get2 key=" << node->key << " true" << std::endl;
delete node;
return ret;
}
}
bool Empty() {
std::lock_guard<std::mutex> lock(mutex_);
return head_->next == tail_;
}
int Size() {
std::lock_guard<std::mutex> lock(mutex_);
return size;
}
private:
void detach(HeterNode<K, T>* node) {
node->prev->next = node->next;
node->next->prev = node->prev;
size--;
}
void attach(HeterNode<K, T>* node) {
node->prev = head_;
node->next = head_->next;
head_->next->prev = node;
head_->next = node;
size++;
}
private:
HeterNode<K, T>* head_;
HeterNode<K, T>* tail_;
std::unordered_map<K, HeterNode<K, T>*> map_;
std::unordered_set<K> task_map_;
std::mutex mutex_;
std::condition_variable cond_;
int cap_;
int size;
};
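A sketch of the reservation handshake encoded by TryGet/TryPut above: a missed TryGet(key) records the key in task_map_, and the next TryPut for that key consumes the reservation and returns false, signalling the producer to dispatch the value directly rather than queue it. ProducerSide below is a hypothetical illustration only.
// Illustrative only: producer-side reaction to a consumed reservation.
inline void ProducerSide(HeterList<int, std::shared_ptr<HeterTask>>& list,
                         int key, std::shared_ptr<HeterTask> task) {
  if (!list.TryPut(key, task)) {
    // A consumer already called TryGet(key) and gave up waiting in the list;
    // hand `task` over out-of-band instead of leaving it queued.
  }
}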
} // namespace framework
} // namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
package paddle.framework;
option cc_generic_services = true;
// It can be: LoDTensor, SelectedRows or NCCL_ID
enum VarType {
LOD_TENSOR = 0;
SELECTED_ROWS = 1;
NCCL_ID = 2;
}
// VariableMessage is serialized paddle variable message.
// NOTICE(gongwb): don't modify this proto if you are not
// familiar with how we serialize in sendrecvop_utils.h
// and deserialize it in variable_response.h.
message VariableMessage {
enum Type {
// Pod Types
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
}
message LodData { repeated int64 lod_data = 1; }
optional string varname = 1;
// TODO(Yancey1989): reference framework::proto::VarDesc::VarType
optional VarType type = 2;
// bool persistable is not needed for sending.
// tensor info:
optional Type data_type = 3;
repeated int64 dims = 4;
// lod details:
optional int64 lod_level = 5;
repeated LodData lod = 6;
// selected_rows height, aka. original dim0
optional int64 slr_height = 7;
// tensor data
optional bytes data = 8;
}
message HeterRequest {
required int32 cmd = 1;
optional int32 cur_batch = 2;
repeated VariableMessage vars = 3;
};
message HeterResponse {
// optional VariableMessage vars = 1;
repeated VariableMessage vars = 1;
};
service HeterService { rpc service(HeterRequest) returns (HeterResponse); };
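For orientation, a hedged C++ sketch of filling a HeterRequest through the protoc-generated API; FillRunTaskRequest and the batch size are hypothetical, while the cmd values mirror HeterXpuTrainer::RegisterServiceHandler further down (0 = run task, 1 = end pass, 2 = stop service).
// Illustrative only: building a run-task request for the service above.
void FillRunTaskRequest(paddle::framework::HeterRequest* request) {
  request->set_cmd(0);              // required field: 0 selects RunTask
  request->set_cur_batch(32);       // hypothetical batch size
  auto* var = request->add_vars();  // one VariableMessage per tensor sent
  var->set_varname("click");        // name taken from the diff above
  var->set_type(paddle::framework::LOD_TENSOR);
  var->set_data_type(paddle::framework::VariableMessage::FP32);
}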
This diff is collapsed.
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstdlib>
#include <ctime>
#include <string>
#include <vector>
#include "io/fs.h"
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/framework/trainer.h"
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
#include "paddle/fluid/platform/cuda_device_guard.h"
namespace paddle {
namespace framework {
void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
Dataset* dataset) {
srand((unsigned)time(NULL));
param_ = trainer_desc.downpour_param();
for (int i = 0; i < param_.dense_table_size(); ++i) {
uint64_t table_id = static_cast<uint64_t>(param_.dense_table(i).table_id());
auto table = param_.dense_table(i);
dense_grad_names_[table_id].resize(table.dense_grad_name_size());
for (int j = 0; j < table.dense_grad_name_size(); ++j) {
dense_grad_names_[table_id][j] = table.dense_grad_name(j);
}
}
scale_datanorm_ = trainer_desc.scale_datanorm();
int place_num = trainer_desc.worker_places_size();
for (int i = 0; i < place_num; ++i) {
int num = trainer_desc.worker_places(i);
platform::CUDAPlace place = platform::CUDAPlace(num);
platform::CUDADeviceGuard guard(place.device);
cudaStream_t stream;
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
copy_streams_.push_back(stream);
places_.push_back(place);
cudaEvent_t event;
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
events_.push_back(event);
}
// thread_num_ = trainer_desc.thread_num();
// SetDataset(dataset);
// dump_fields_path_ = trainer_desc.dump_fields_path();
// dump_converter_ = trainer_desc.dump_converter();
// need_dump_field_ = false;
// if (trainer_desc.dump_fields_size() != 0 && dump_fields_path_ != "") {
// need_dump_field_ = true;
// }
// if (need_dump_field_) {
// auto &file_list = dataset->GetFileList();
// if (file_list.size() == 0) {
// need_dump_field_ = false;
// }
// }
// mpi_rank_ = trainer_desc.mpi_rank();
// mpi_size_ = trainer_desc.mpi_size();
// dump_file_num_ = trainer_desc.dump_file_num();
// const std::vector<paddle::framework::DataFeed *> readers =
// dataset->GetReaders();
// thread_num_ = readers.size();
for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size();
i++) {
need_merge_var_names_.push_back(
trainer_desc.downpour_param().stat_var_names(i));
}
running_ = true;
VLOG(3) << "going to initialize pull dense worker";
pull_dense_worker_ = PullDenseWorker::GetInstance();
pull_dense_worker_->Initialize(trainer_desc);
VLOG(3) << "initialize pull dense worker";
SetDebug(trainer_desc.debug());
fleet_ptr_ = FleetWrapper::GetInstance();
heter_ptr_ = HeterWrapper::GetInstance();
RegisterServiceHandler();
// for (int i = 0; i < trainer_desc.worker_places_size(); ++i) {
// int num = trainer_desc.worker_places(i);
// platform::CUDAPlace place = platform::CUDAPlace(num);
// platform::CUDADeviceGuard guard(place.device);
// cudaStream_t stream;
// PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
// copy_streams_.push_back(stream);
// places_.push_back(place);
// }
trainer_desc_ = trainer_desc;
}
void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
auto place = places_[num];
Scope* scope = place_scopes_[num];
auto stream = copy_streams_[num];
auto event = events_[num];
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::CUDADeviceGuard guard(dev_id);
auto& block = program.Block(0);
for (auto& var : block.AllVars()) {
if (var->Persistable()) {
auto name = var->Name();
Variable* root_var = root_scope_->FindVar(name);
LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
auto* ptr = scope->Var(name);
InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
LoDTensor* thread_tensor = ptr->GetMutable<LoDTensor>();
#define HeterMemcpyFunc(cpp_type, proto_type) \
do { \
if (root_tensor->type() == proto_type) { \
HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place, stream); \
} \
} while (0)
_ForEachDataType_(HeterMemcpyFunc);
}
}
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
cudaEventSynchronize(event);
}
template <typename T>
void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
LoDTensor* root_tensor,
const paddle::platform::Place& thread_place,
cudaStream_t stream) {
T* thread_ptr =
thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
T* root_ptr = root_tensor->data<T>();
if (platform::is_cpu_place(root_tensor->place())) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr,
platform::CPUPlace(), root_ptr,
sizeof(T) * root_tensor->numel(), stream);
} else {
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr,
BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()),
root_ptr, sizeof(T) * root_tensor->numel(), stream);
}
}
void HeterXpuTrainer::DumpWork(int tid) {}
void HeterXpuTrainer::InitTrainerEnv(const ProgramDesc& main_program,
const platform::Place& place) {
CacheProgram(main_program);
place_ = place;
auto& profiler = paddle::ps::CostProfiler::instance();
profiler.register_profiler("xpu_service_run_task");
profiler.register_profiler("xpu_service_deserial");
profiler.register_profiler("xpu_service_launch_kernel");
profiler.register_profiler("xpu_service_wait");
}
void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
auto& block = main_program.Block(0);
pull_dense_worker_->SetRootScope(root_scope_);
pull_dense_worker_->CreatePinVar();
for (size_t i = 0; i < places_.size(); ++i) {
Scope* scope = &(root_scope_->NewScope());
// for (auto &var : block.AllVars()) {
// if (var->Persistable()) {
// auto *ptr = scope->Var(var->Name());
// InitializeVariable(ptr, var->GetType());
// }
// }
place_scopes_.push_back(scope);
CreateThreadParam(main_program, i);
pull_dense_worker_->AddThreadScope(scope);
pull_dense_worker_->AddPlace(places_[i]);
pull_dense_worker_->AddStream(copy_streams_[i]);
}
pull_dense_worker_->Start();
for (auto& stream : copy_streams_) {
cudaStreamSynchronize(stream);
}
op_names_.clear();
for (auto& op_desc : block.AllOps()) {
std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
op_names_.push_back(op_desc->Type());
OperatorBase* local_op_ptr = local_op.release();
ops_.push_back(local_op_ptr);
continue;
}
xpu_begin_op_index_ = xpu_end_op_index_ = -1;
xpu_begin_op_index_ = trainer_desc_.xpu_start_idx();
xpu_end_op_index_ = trainer_desc_.xpu_end_idx();
VLOG(0) << "xpu begin: " << xpu_begin_op_index_
<< " xpu end: " << xpu_end_op_index_;
// CHECK(xpu_begin_op_index_ == 0);
// CHECK(xpu_end_op_index_ = ops_.size() - 1);
//// init pool
for (size_t i = 0; i < 6; ++i) {
for (size_t j = 0; j < places_.size(); ++j) {
int num = j;
std::shared_ptr<HeterServiceContext> context =
std::make_shared<HeterServiceContext>();
context->place_num_ = num;
auto place = places_[num];
context->scope_ = &(place_scopes_[num]->NewScope());
auto& block = program_.Block(0);
for (auto& var : block.AllVars()) {
if (!var->Persistable()) {
auto* ptr = context->scope_->Var(var->Name());
InitializeVariable(ptr, var->GetType());
}
}
for (auto& v : dense_grad_names_) {
for (auto& name : v.second) {
auto* ptr = context->scope_->Var(name + "pin");
InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
}
}
for (auto& op_desc : block.AllOps()) {
std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
OperatorBase* local_op_ptr = local_op.release();
(context->ops_).push_back(local_op_ptr);
}
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::CUDADeviceGuard guard(dev_id);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
object_pool_.Push(context);
}
}
VLOG(3) << "init other env done.";
}
void HeterXpuTrainer::Run() {}
int HeterXpuTrainer::EndPass(const HeterRequest* request,
HeterResponse* response) {
// int scope_num = object_pool_.Size();
for (size_t i = 0; i < need_merge_var_names_.size(); i++) {
Variable* root_var = root_scope_->FindVar(need_merge_var_names_[i]);
if (root_var == nullptr) {
continue;
}
LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
for (size_t j = 0; j < place_scopes_.size(); j++) {
Scope* cur_thread_scope = place_scopes_[j];
Variable* thread_var =
cur_thread_scope->FindVar(need_merge_var_names_[i]);
if (thread_var == nullptr) {
continue;
}
LoDTensor* thread_tensor = thread_var->GetMutable<LoDTensor>();
// if (root_tensor->numel() != thread_tensor->numel()) {
// continue;
// }
#define MergeCallback(cpp_type, proto_type) \
do { \
if (root_tensor->type() == proto_type) { \
if (thread_tensor->type() != proto_type) { \
VLOG(0) << "Error: thread id=" << j << ", need_merge_var_names_[" << i \
<< "] " << need_merge_var_names_[i] \
<< ", root tensor type=" << root_tensor->type() \
<< ", thread tensor type=" << thread_tensor->type(); \
exit(-1); \
} \
MergeToRootScope<cpp_type>(root_tensor, thread_tensor); \
} \
} while (0)
_ForEachDataType_(MergeCallback);
if (platform::is_gpu_place(thread_tensor->place())) {
auto dev_id =
BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device;
platform::CUDADeviceGuard guard(dev_id);
cudaMemset(thread_tensor->data<void>(), 0,
thread_tensor->numel() * SizeOfType(thread_tensor->type()));
} else {
memset(thread_tensor->data<void>(), 0,
thread_tensor->numel() * SizeOfType(thread_tensor->type()));
}
}
auto* merge_var = response->add_vars();
heter_ptr_->SerializeToReq(need_merge_var_names_[i], root_scope_,
merge_var);
if (platform::is_gpu_place(root_tensor->place())) {
auto dev_id =
BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device;
platform::CUDADeviceGuard guard(dev_id);
cudaMemset(root_tensor->data<void>(), 0,
root_tensor->numel() * SizeOfType(root_tensor->type()));
} else {
memset(root_tensor->data<void>(), 0,
root_tensor->numel() * SizeOfType(root_tensor->type()));
}
}
return 0;
}
template <typename T>
void HeterXpuTrainer::MergeToRootScope(LoDTensor* root_tensor,
LoDTensor* tensor) {
LoDTensor tmp_root;
TensorCopy(*root_tensor, platform::CPUPlace(), &tmp_root);
T* tmp_root_data = tmp_root.data<T>();
LoDTensor tmp_tensor;
TensorCopy(*tensor, platform::CPUPlace(), &tmp_tensor);
T* data = tmp_tensor.data<T>();
for (int i = 0; i < tmp_tensor.numel(); i++) {
tmp_root_data[i] += data[i];
}
TensorCopy(tmp_root, root_tensor->place(), root_tensor);
}
int HeterXpuTrainer::StopService(const HeterRequest* request,
HeterResponse* response) {
std::unique_lock<std::mutex> lock(mutex_);
running_ = false;
cond_.notify_one();
return 0;
}
int HeterXpuTrainer::RunTask(const HeterRequest* request,
HeterResponse* response) {
auto timer = std::make_shared<paddle::ps::CostTimer>("xpu_service_run_task");
std::shared_ptr<HeterServiceContext> context = object_pool_.Get();
if (!context->scope_) {
int num = rand() % places_.size();
context->place_num_ = num;
auto place = places_[num];
context->scope_ = &(place_scopes_[num]->NewScope());
auto& block = program_.Block(0);
for (auto& var : block.AllVars()) {
if (!var->Persistable()) {
auto* ptr = context->scope_->Var(var->Name());
InitializeVariable(ptr, var->GetType());
}
}
for (auto& v : dense_grad_names_) {
for (auto& name : v.second) {
auto* ptr = context->scope_->Var(name + "pin");
InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
}
}
for (auto& op_desc : block.AllOps()) {
std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
OperatorBase* local_op_ptr = local_op.release();
(context->ops_).push_back(local_op_ptr);
}
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::CUDADeviceGuard guard(dev_id);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
}
context->Reset();
auto place = places_[context->place_num_];
{
auto deserial_timer =
std::make_shared<paddle::ps::CostTimer>("xpu_service_deserial");
for (int i = 0; i < request->vars_size(); ++i) {
heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place,
copy_streams_[context->place_num_]);
}
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventRecord(context->event_, copy_streams_[context->place_num_]));
while (cudaEventQuery(context->event_) != cudaSuccess) {
VLOG(3) << "wait for kernel";
bthread_yield();
}
}
{
auto launch_timer =
std::make_shared<paddle::ps::CostTimer>("xpu_service_launch_kernel");
for (int i = xpu_begin_op_index_; i <= xpu_end_op_index_; ++i) {
auto& op = (context->ops_)[i];
op->Run(*(context->scope_), place);
}
}
auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventRecord(context->event_, dev_ctx->stream()));
// cudaEventSynchronize(context->event_);
{
auto wait_timer =
std::make_shared<paddle::ps::CostTimer>("xpu_service_wait");
while (cudaEventQuery(context->event_) != cudaSuccess) {
VLOG(3) << "wait for kernel";
bthread_yield();
}
}
for (int i = 0; i < trainer_desc_.xpu_send_list_size(); ++i) {
const std::string& varname = trainer_desc_.xpu_send_list(i);
// CHECK(varname == "concat_1.tmp_0@GRAD");
auto* res_var = response->add_vars();
heter_ptr_->SerializeToReq(varname, context->scope_, res_var);
}
// std::string varname = "concat_1.tmp_0@GRAD";
//
// auto* res_var = response->add_vars();
// heter_ptr_->SerializeToReq(varname, context->scope_, res_var);
for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
++i) {
uint64_t tid =
static_cast<uint64_t>(param_.program_config(0).push_dense_table_id(i));
fleet_ptr_->PushDenseVarsAsync(
*(context->scope_), tid, dense_grad_names_[tid],
&(context->push_dense_status_), scale_datanorm_, request->cur_batch(),
places_[context->place_num_], copy_streams_[context->place_num_],
context->event_);
}
for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
++i) {
uint64_t tid =
static_cast<uint64_t>(param_.program_config(0).push_dense_table_id(i));
pull_dense_worker_->IncreaseThreadVersion(0, tid);
}
VLOG(3) << "push dense gradient done.";
context->scope_->DropKids();
object_pool_.Push(context);
VLOG(0) << "pool size " << object_pool_.Size();
return 0;
}
void HeterXpuTrainer::RegisterServiceHandler() {
heter_ptr_->RegisterServiceHandler(
0, [this](const HeterRequest* request, HeterResponse* response) -> int {
return this->RunTask(request, response);
});
heter_ptr_->RegisterServiceHandler(
1, [this](const HeterRequest* request, HeterResponse* response) -> int {
return this->EndPass(request, response);
});
heter_ptr_->RegisterServiceHandler(
2, [this](const HeterRequest* request, HeterResponse* response) -> int {
return this->StopService(request, response);
});
}
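// Handler ids are the service's dispatch contract with the peer that issues
// HeterRequests: 0 -> RunTask, 1 -> EndPass, 2 -> StopService.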
Scope* HeterXpuTrainer::GetWorkerScope(int thread_id) { return nullptr; }
void HeterXpuTrainer::Finalize() {
// for (auto &th : threads_) {
// th.join();
// }
std::unique_lock<std::mutex> lock(mutex_);
cond_.wait(lock, [this] { return !running_; });
sleep(3);
pull_dense_worker_->Stop();
root_scope_->DropKids();
}
} // namespace framework
} // namespace paddle
#endif
...@@ -16,9 +16,6 @@
 #include "paddle/fluid/framework/io/crypto/aes_cipher.h"
 #include "paddle/fluid/framework/io/crypto/cipher_utils.h"
 #include "paddle/fluid/platform/enforce.h"
-#ifdef ON_INFER
-#include "paddle/fluid/inference/api/paddle_api.h"
-#endif
 
 namespace paddle {
 namespace framework {
...@@ -59,7 +56,7 @@ std::shared_ptr<Cipher> CipherFactory::CreateCipher(
 }
 }  // namespace framework
 
-#ifdef ON_INFER
+#ifdef PADDLE_ON_INFERENCE
 std::shared_ptr<framework::Cipher> MakeCipher(const std::string& config_file) {
   return framework::CipherFactory::CreateCipher(config_file);
 }
......
...@@ -368,3 +368,7 @@ REGISTER_PASS(conv_transpose_bn_fuse_pass,
               paddle::framework::ir::ConvTransposeBNFusePass);
 REGISTER_PASS(conv_transpose_eltwiseadd_bn_fuse_pass,
               paddle::framework::ir::ConvTransposeEltwiseAddBNFusePass);
+REGISTER_PASS(depthwise_conv_bn_fuse_pass,
+              paddle::framework::ir::DepthwiseConvBNFusePass);
+REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass,
+              paddle::framework::ir::DepthwiseConvEltwiseAddBNFusePass);
...@@ -56,6 +56,16 @@ class ConvTransposeEltwiseAddBNFusePass : public ConvEltwiseAddBNFusePass {
   std::string conv_type() const { return "conv2d_transpose"; }
 };
 
+class DepthwiseConvBNFusePass : public ConvBNFusePass {
+ public:
+  std::string conv_type() const { return "depthwise_conv2d"; }
+};
+
+class DepthwiseConvEltwiseAddBNFusePass : public ConvEltwiseAddBNFusePass {
+ public:
+  std::string conv_type() const { return "depthwise_conv2d"; }
+};
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
...@@ -68,11 +68,35 @@ static bool HasInput(Node* n, std::string name) {
   return input_names_set.find(name) != input_names_set.end();
 }
 
+static Node* GetInputVar(Node* n, const std::string& name) {
+  PADDLE_ENFORCE_EQ(n && n->IsOp() && n->Op(), true,
+                    platform::errors::InvalidArgument(
+                        "Expected node %p to be an operator node.", n));
+  for (auto* in : n->inputs) {
+    if (in->Name() == name) {
+      return in;
+    }
+  }
+  return nullptr;
+}
+
+static Node* GetOutputVar(Node* n, const std::string& name) {
+  PADDLE_ENFORCE_EQ(n && n->IsOp() && n->Op(), true,
+                    platform::errors::InvalidArgument(
+                        "Expected node %p to be an operator node.", n));
+  for (auto* out : n->outputs) {
+    if (out->Name() == name) {
+      return out;
+    }
+  }
+  return nullptr;
+}
+
 std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
     SubGraph* subgraph) {
-  std::unordered_map<std::string, int> var_ids = EncodeVarNodes(subgraph);
-  std::vector<Node*> intermediate_out_nodes =
-      subgraph->GetIntermediateOutVarNodes();
+  std::unordered_map<Node*, int> var_ids = EncodeVarNodes(subgraph);
+  std::unordered_set<Node*> intermediate_out_vars_set =
+      subgraph->GetIntermediateOutVarNodesSet();
   std::vector<OperationExpression> expressions;
   for (auto* node : subgraph->SortedNodes()) {
     if (node && node->IsOp() && node->Op()) {
...@@ -92,11 +116,12 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
         // "elementwise_add_grad", where "X", "Y" and "Out" are not used.
         if ((HasInput(node, name) && op->Input(name).size() >= 1U)) {
           for (size_t i = 0; i < op->Input(name).size(); i++) {
+            Node* input_var = GetInputVar(node, op->Input(name)[i]);
             PADDLE_ENFORCE_NE(
-                var_ids.find(op->Input(name)[i]), var_ids.end(),
+                var_ids.find(input_var), var_ids.end(),
                 platform::errors::InvalidArgument(
                     "Input(%s) of operation %s is not set.", name, op->Type()));
-            input_ids.push_back(var_ids[op->Input(name)[i]]);
+            input_ids.push_back(var_ids[input_var]);
           }
         } else {
           input_ids.push_back(-1);
...@@ -106,31 +131,29 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
       // Output ids should be set in fixed order, like:
       //  - dx, dy in backward operations
       std::vector<int> output_ids;
+      std::vector<int> intermediate_output_ids;
       std::vector<std::string> output_names =
           OperationMap::Instance().Get(op->Type()).output_names;
-      std::unordered_map<int, bool> intermediate_state;
 
       for (auto& name : output_names) {
+        Node* output_var = GetOutputVar(node, op->Output(name)[0]);
         PADDLE_ENFORCE_NE(
-            var_ids.find(op->Output(name)[0]), var_ids.end(),
+            var_ids.find(output_var), var_ids.end(),
            platform::errors::InvalidArgument(
                 "Output(%s) of operation %s is not set.", name, op->Type()));
-        output_ids.push_back(var_ids[op->Output(name)[0]]);
-        bool enable_intermediate = false;
-        for (auto* n : intermediate_out_nodes) {
-          if (n->Name() == op->Output(name)[0]) {
-            enable_intermediate = true;
-            break;
-          }
-        }
-        intermediate_state[var_ids[op->Output(name)[0]]] = enable_intermediate;
+        output_ids.push_back(var_ids[output_var]);
+        if (!subgraph->SaveIntermediateOut() &&
+            intermediate_out_vars_set.find(output_var) !=
+                intermediate_out_vars_set.end()) {
+          intermediate_output_ids.push_back(var_ids[output_var]);
+        }
       }
 
       std::string lhs_type = ExtractDataType(node->outputs);
       std::string rhs_type = ExtractDataType(node->inputs);
       auto expression =
           OperationExpression(node->Name(), input_ids, output_ids, rhs_type,
-                              lhs_type, intermediate_state);
+                              lhs_type, intermediate_output_ids);
       expression.SetAttr(attr);
       expressions.push_back(expression);
     }
...@@ -146,17 +169,18 @@ std::string CodeGenerator::Generate(
   // TODO(liuyiqun): Check whether all expressions are elementwise operations.
   std::set<int> input_ids = std::move(DistilInputIds(expressions));
   std::set<int> output_ids = std::move(DistilOutputIds(expressions));
-  std::set<int> intermediate_ids =
+  std::set<int> intermediate_output_ids =
       std::move(DistilIntermediateIds(expressions));
   std::unordered_map<int, std::string> dtypes =
       std::move(DistilDtypes(expressions));
   TemplateVariable template_var;
   template_var.Add("func_name", func_name);
-  template_var.Add("parameters", EmitParameters(input_ids, output_ids,
-                                                intermediate_ids, dtypes));
+  template_var.Add(
+      "parameters",
+      EmitParameters(input_ids, output_ids, intermediate_output_ids, dtypes));
   template_var.Add("compute_body",
                    EmitComputeBody(expressions, input_ids, output_ids,
-                                   intermediate_ids, dtypes));
+                                   intermediate_output_ids, dtypes));
 
   std::set<std::string> all_dtype;
   for (const auto& type : dtypes) {
...@@ -204,18 +228,14 @@ std::set<int> CodeGenerator::DistilOutputIds(
 std::set<int> CodeGenerator::DistilIntermediateIds(
     const std::vector<OperationExpression>& expressions) {
-  std::set<int> intermediate_ids;
+  std::set<int> intermediate_output_ids;
   // Use std::set to remove the repeated ids and get an ordered list.
   for (size_t i = 0; i < expressions.size(); i++) {
-    for (auto id : expressions[i].GetOutputIds()) {
-      auto intermediate_state = expressions[i].GetIntermediateState();
-      if (intermediate_state.find(id) != intermediate_state.end() &&
-          intermediate_state[id]) {
-        intermediate_ids.insert(id);
-      }
+    for (auto id : expressions[i].GetIntermediateOutputIds()) {
+      intermediate_output_ids.insert(id);
     }
   }
-  return intermediate_ids;
+  return intermediate_output_ids;
 }
 
 std::unordered_map<int, std::string> CodeGenerator::DistilDtypes(
...@@ -316,26 +336,29 @@ std::string CodeGenerator::EmitComputeBody(
   return load.str() + compute.str() + store.str();
 }
 
-std::unordered_map<std::string, int> CodeGenerator::EncodeVarNodes(
+std::unordered_map<Node*, int> CodeGenerator::EncodeVarNodes(
     SubGraph* subgraph) {
   const auto& input_var_nodes = subgraph->GetInputVarNodes();
-  const auto& output_var_nodes = subgraph->GetOutputVarNodes();
+  // Encode all var nodes, including intermediate output var nodes.
+  const auto& output_var_nodes = subgraph->GetOutputVarNodes(true);
 
   int id = 0;
-  std::unordered_map<std::string, int> var_ids;
+  std::unordered_map<Node*, int> var_ids;
   // Numbering input vars.
   for (auto* in : input_var_nodes) {
-    VLOG(3) << "Encoding input names:" << in->Name() << ", id:" << id;
-    if (var_ids.find(in->Name()) == var_ids.end()) {
-      var_ids[in->Name()] = id++;
+    VLOG(3) << "Encoding input names:" << in->Name() << "(" << in
+            << "), id:" << id;
+    if (var_ids.find(in) == var_ids.end()) {
+      var_ids[in] = id++;
     }
   }
   // Encoding output vars.
   for (auto* out : output_var_nodes) {
-    VLOG(3) << "Encoding output names:" << out->Name() << ", id:" << id;
-    if (var_ids.find(out->Name()) == var_ids.end()) {
-      var_ids[out->Name()] = id++;
+    VLOG(3) << "Encoding output names:" << out->Name() << "(" << out
+            << "), id:" << id;
+    if (var_ids.find(out) == var_ids.end()) {
+      var_ids[out] = id++;
    }
   }
   return var_ids;
......
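The switch from name-keyed to node-keyed maps above matters because two distinct var nodes in one graph may carry the same name, and a name-keyed map would conflate their ids. A minimal standalone analogue of the fix (Node here is a stand-in struct, not the framework's ir::Node):

#include <string>
#include <unordered_map>

struct Node {
  std::string name;
};

int main() {
  Node a{"tmp_0"}, b{"tmp_0"};         // same name, different nodes
  std::unordered_map<Node*, int> ids;  // pointer-keyed, as in the new code
  ids[&a] = 0;
  ids[&b] = 1;  // both entries survive; a name-keyed map would keep only one
  return ids.size() == 2 ? 0 : 1;
}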
...@@ -61,7 +61,7 @@ class CodeGenerator {
       const std::unordered_map<int, std::string>& dtypes) const;
 
   // Encode all var nodes in the subgraph with a unique number.
-  std::unordered_map<std::string, int> EncodeVarNodes(SubGraph* subgraph);
+  std::unordered_map<Node*, int> EncodeVarNodes(SubGraph* subgraph);
 
  private:
   std::vector<CodeTemplate> code_templates_;
......
...@@ -48,20 +48,20 @@ class OperationExpression {
       std::string op_type, const std::vector<int>& input_ids,
       const std::vector<int>& output_ids, std::string rhs_type,
       std::string lhs_type,
-      const std::unordered_map<int, bool>& intermediate_state = {})
+      const std::vector<int>& intermediate_output_ids = {})
       : op_type_(op_type),
         input_ids_(input_ids),
         output_ids_(output_ids),
         rhs_type_(rhs_type),
         lhs_type_(lhs_type),
-        intermediate_state_(intermediate_state) {}
+        intermediate_output_ids_(intermediate_output_ids) {}
 
   std::string GetOpType() const { return op_type_; }
-  std::unordered_map<int, bool> GetIntermediateState() const {
-    return intermediate_state_;
-  }
   std::vector<int> GetInputIds() const { return input_ids_; }
   std::vector<int> GetOutputIds() const { return output_ids_; }
+  std::vector<int> GetIntermediateOutputIds() const {
+    return intermediate_output_ids_;
+  }
   std::string GetRHSType() const { return rhs_type_; }
   std::string GetLHSType() const { return lhs_type_; }
   void SetAttr(AttributeMap attr) { attr_ = attr; }
...@@ -84,7 +84,7 @@ class OperationExpression {
   AttributeMap attr_;
   std::string rhs_type_;
   std::string lhs_type_;
-  std::unordered_map<int, bool> intermediate_state_;
+  std::vector<int> intermediate_output_ids_;
 };
 
 class TemplateVariable {
......
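Given the new constructor, marking an output as intermediate is just a matter of repeating its id in the last argument. A construction sketch (ids and types are hypothetical):

// Suppose var id 2 is produced and consumed only inside the fused subgraph:
OperationExpression expr("elementwise_add", /*input_ids=*/{0, 1},
                         /*output_ids=*/{2}, /*rhs_type=*/"float",
                         /*lhs_type=*/"float",
                         /*intermediate_output_ids=*/{2});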
...@@ -144,7 +144,6 @@ void CheckOutput(const std::vector<OperationExpression>& expressions,
       LOG(INFO) << "Precision check failed from i = " << id
                 << ", expect: " << expect << ", actual: " << actual;
       EXPECT_LT(fabs(actual - expect), eps);
-      break;
     }
   }
 }
...@@ -465,7 +464,7 @@ TEST(code_generator, subgraph) {
   for (std::string dtype : {"float", "__half"}) {
     std::unique_ptr<paddle::framework::ir::Graph> graph =
         BuildGraph(false, dtype);
-    fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", false,
+    fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", true,
                                     graph->Nodes());
 
     // Expressions generated by code_generator (they may be different):
...@@ -484,7 +483,7 @@ TEST(code_generator, subgraph_grad) {
   for (std::string dtype : {"float", "__half"}) {
     std::unique_ptr<paddle::framework::ir::Graph> graph =
         BuildGraph(true, dtype);
-    fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", false,
+    fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", true,
                                     DistilGradNodes(graph));
 
     // Expressions generated by code_generator (they may be different):
......
...@@ -63,7 +63,7 @@ static bool IsEqualAndNotEmpty(const std::vector<int64_t>& l,
 bool GroupDetector::CheckPrecondition(const Node* n) {
   auto check_data_type = [&](const std::vector<Node*>& nodes) -> bool {
     bool is_first = true;
-    proto::VarType::Type data_type_0;
+    proto::VarType::Type data_type_0 = proto::VarType::BOOL;
     for (auto* n : nodes) {
       if (n && n->IsVar() && n->Var()) {
         if (n->Var()->GetType() != proto::VarType::LOD_TENSOR) {
......
...@@ -63,11 +63,6 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
         std::unordered_set<Node*>(vec.begin(), vec.end()));
     VLOG(3) << "subgraph: {\n" << DebugString(subgraph.SortedNodes()) << "}\n";
 
-    // In elementwise fused kernel, memory is the bound of execution,
-    // here we remove the output id to use less memory and less time.
-    if (subgraph.RemoveIntermediateOut()) {
-      subgraph.DetectIntermediateOutWithGraph(graph);
-    }
     if (subgraph.IsValid(min_subgraph_size)) {
       subgraph.SetFuncName("fused_elementwise_" + std::to_string(index++));
       if (GenerateCode(&subgraph)) {
...@@ -115,57 +110,52 @@ static int ExtractOpRole(fusion_group::SubGraph* subgraph) {
 void FusionGroupPass::InsertFusionGroupOp(
     Graph* graph, fusion_group::SubGraph* subgraph) const {
-  const std::vector<Node*>& input_vars_of_subgraph =
-      subgraph->GetInputVarNodes();
-  const std::vector<Node*>& output_vars_of_subgraph =
-      subgraph->GetOutputVarNodes();
-  const std::vector<Node*> intermediate_vars_of_subgraph =
-      subgraph->GetIntermediateOutVarNodes();
+  const std::vector<Node*>& input_vars = subgraph->GetInputVarNodes();
+  const std::vector<Node*>& output_vars =
+      subgraph->GetOutputVarNodes(subgraph->SaveIntermediateOut());
   std::unordered_set<Node*> external_nodes;
 
-  OpDesc op_desc;
-  op_desc.SetType("fusion_group");
+  // Prepare inputs.
   std::vector<std::string> input_names;
-  std::vector<std::string> inputs_data_types;
-  for (auto* n : input_vars_of_subgraph) {
-    input_names.push_back(n->Name());
-    inputs_data_types.push_back(DataTypeToString(n->Var()->GetDataType()));
-    external_nodes.insert(n);
+  std::vector<int> input_dtypes;
+  std::unordered_set<Node*> output_vars_set(output_vars.begin(),
+                                            output_vars.end());
+  for (auto* n : input_vars) {
+    // It is not an output var node.
+    if (output_vars_set.find(n) == output_vars_set.end()) {
+      input_names.push_back(n->Name());
+      input_dtypes.push_back(n->Var()->GetDataType());
+      external_nodes.insert(n);
+    }
   }
-  op_desc.SetInput("Inputs", input_names);
 
+  // Prepare outputs.
   std::vector<std::string> output_names;
-  std::vector<std::string> outs_data_types;
-  std::vector<Node*> output_var_without_intermediate;
-  for (auto* n : output_vars_of_subgraph) {
-    auto it_input =
-        find(input_vars_of_subgraph.begin(), input_vars_of_subgraph.end(), n);
-    auto it_intermediate = find(intermediate_vars_of_subgraph.begin(),
-                                intermediate_vars_of_subgraph.end(), n);
-    if (it_intermediate == intermediate_vars_of_subgraph.end() &&
-        it_input == input_vars_of_subgraph.end()) {
-      output_names.push_back(n->Name());
-      outs_data_types.push_back(DataTypeToString(n->Var()->GetDataType()));
-      output_var_without_intermediate.push_back(n);
-    }
+  std::vector<int> output_dtypes;
+  for (auto* n : output_vars) {
+    output_names.push_back(n->Name());
+    output_dtypes.push_back(n->Var()->GetDataType());
     external_nodes.insert(n);
   }
 
+  OpDesc op_desc;
+  op_desc.SetType("fusion_group");
+  op_desc.SetInput("Inputs", input_names);
   op_desc.SetOutput("Outs", output_names);
-  op_desc.SetAttr("inputs_data_type", inputs_data_types);
-  op_desc.SetAttr("outs_data_type", outs_data_types);
+  op_desc.SetAttr("inputs_dtype", input_dtypes);
+  op_desc.SetAttr("outs_dtype", output_dtypes);
   op_desc.SetAttr("type", subgraph->GetType());
   op_desc.SetAttr("func_name", subgraph->GetFuncName());
   op_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
                   ExtractOpRole(subgraph));
 
   Node* fusion_group_node = graph->CreateOpNode(&op_desc);
-  for (auto* in : input_vars_of_subgraph) {
-    IR_NODE_LINK_TO(in, fusion_group_node);
+  for (auto* in : input_vars) {
+    if (output_vars_set.find(in) == output_vars_set.end()) {
+      IR_NODE_LINK_TO(in, fusion_group_node);
+    }
   }
-
-  for (auto* out : output_var_without_intermediate) {
+  for (auto* out : output_vars) {
     IR_NODE_LINK_TO(fusion_group_node, out);
   }
......
...@@ -105,12 +105,6 @@ void OperationMap::InsertUnaryElementwiseOperations() {
   insert_handler("tanh", "%{2.0} / (%{1.0} + Exp(-%{2.0} * ${0})) - %{1.0}",
                  {"${2} * (%{1.0} - ${1} * ${1})"});
 
-  // cast:
-  //  out = static_cast<T>(x)
-  // TODO(wangchaochaohu): This is not the complete definition of
-  //  cast Op. We need to refine it later.
-  insert_handler("cast", "${0}", {});
-
   // sqrt:
   //  out = x^(1/2)
   //  dx = dout * 0.5 / out
...@@ -121,11 +115,21 @@ void OperationMap::InsertUnaryElementwiseOperations() {
   //  dx = dout * 2.0 * x
   insert_handler("square", "${0} * ${0}", {"${2} * %{2.0} * ${0}"});
 
+  // assign:
+  //  out = x
+  insert_handler("assign", "${0}", {});
+
+  // cast:
+  //  out = static_cast<T>(x)
+  // TODO(wangchaochaohu): This is not the complete definition of
+  //  cast Op. We need to refine it later.
+  insert_handler("cast", "${0}", {});
+
   // scale:
   //  out = (bias_after_scale) ? scale * X + bias : scale(X + bias)
   // Here we use the '=' operator to separate the default value.
   // TODO(wangchaochaohu): Later we need to support Tensor input for scale and
   //  bias.
   insert_handler(
       "scale",
       "${bias_after_scale=true} ? (${scale=%{1.0}} * ${0} + "
......
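As the handlers above show, the expression strings use a small placeholder grammar: ${i} names the i-th operand of the expression and %{v} embeds a literal constant. A plausible expansion of the square handlers, with illustrative variable names (the actual names come from the code templates):

// forward,  from "${0} * ${0}":
//   out = x * x;
// backward, from "${2} * %{2.0} * ${0}",
// where the operands are x = ${0}, out = ${1}, dout = ${2}:
//   dx = dout * 2.0 * x;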
...@@ -66,11 +66,12 @@ class SubGraph {
   }
 
   int GetType() const { return type_; }
-  bool RemoveIntermediateOut() { return !save_intermediate_out_; }
 
   void SetFuncName(std::string func_name) { func_name_ = func_name; }
   std::string GetFuncName() const { return func_name_; }
+  bool SaveIntermediateOut() const { return save_intermediate_out_; }
 
   const std::unordered_set<Node*>& Nodes() const { return nodes_set_; }
   const std::vector<Node*>& SortedNodes() {
     if (!is_sorted_) {
...@@ -118,66 +119,88 @@ class SubGraph {
     return input_vars;
   }
 
-  std::vector<Node*> GetOutputVarNodes() {
+  std::vector<Node*> GetOutputVarNodes(bool with_intermediate_out) {
     // The order of output nodes should be consistent everywhere.
-    std::vector<Node*> output_vars_all;
+    std::vector<Node*> output_vars;
     for (auto* n : SortedNodes()) {
-      if (n && n->IsVar() && n->Var()) {
+      if (IsOutputOfInternalOp(n)) {
         // If the var_node is the output of some op_node in the subgraph, it
         // is considered an output var node of the subgraph.
-        bool is_found = false;
-        for (auto* in : n->inputs) {
-          if (Has(in)) {
-            is_found = true;
-          }
-        }
-        if (is_found) {
-          output_vars_all.push_back(n);
+        if (with_intermediate_out) {
+          output_vars.push_back(n);
+        } else {
+          if (n->outputs.empty() || IsInputOfExternalOp(n)) {
+            output_vars.push_back(n);
+          }
         }
       }
     }
-    return output_vars_all;
+    return output_vars;
   }
 
   std::vector<Node*> GetIntermediateOutVarNodes() {
-    return intermediate_out_nodes_;
+    // Intermediate output var nodes: the output of some op_node in the
+    // subgraph, but not referenced outside the subgraph.
+    std::vector<Node*> intermediate_out_vars;
+    for (auto* n : SortedNodes()) {
+      if (IsOutputOfInternalOp(n) && IsInputOfInternalOp(n) &&
+          !IsInputOfExternalOp(n)) {
+        // When the outputs size is 0, it is also considered an intermediate
+        // output. It may be an unused output or a fetched var, so we cannot
+        // eliminate it directly here.
+        intermediate_out_vars.push_back(n);
+      }
+    }
+    return intermediate_out_vars;
   }
 
-  void DetectIntermediateOutWithGraph(Graph* graph) {
-    auto graph_nodes = graph->Nodes();
-
-    for (auto* n : SortedNodes()) {
-      bool enable_remove = true;
-      if (n && n->IsVar() && n->Var()) {
-        bool leaf_graph = true;
-        for (auto* node : graph_nodes) {
-          if (node->IsOp()) {
-            auto inputs = node->inputs;
-            for (auto* in : inputs) {
-              if (in && in->Name() == n->Name()) {
-                if (!Has(node)) enable_remove = false;
-                leaf_graph = false;
-              }
-            }
-          }
-          if (!enable_remove) {
-            break;
-          }
-        }
-        if (leaf_graph) enable_remove = false;
-      } else {
-        enable_remove = false;
-      }
-
-      if (enable_remove) {
-        intermediate_out_nodes_.push_back(n);
-      }
-    }
-  }
+  std::unordered_set<Node*> GetIntermediateOutVarNodesSet() {
+    std::vector<Node*> intermediate_out_vars = GetIntermediateOutVarNodes();
+    return std::unordered_set<Node*>(intermediate_out_vars.begin(),
+                                     intermediate_out_vars.end());
+  }
 
  private:
+  bool IsInputOfInternalOp(Node* n) {
+    bool is_input_of_internal_op = false;
+    if (Has(n) && n && n->IsVar() && n->Var()) {
+      for (auto* out : n->outputs) {
+        if (Has(out)) {
+          is_input_of_internal_op = true;
+          break;
+        }
+      }
+    }
+    return is_input_of_internal_op;
+  }
+
+  bool IsInputOfExternalOp(Node* n) {
+    // True if n is the input of any node outside the subgraph.
+    bool is_input_of_external_op = false;
+    if (Has(n) && n && n->IsVar() && n->Var()) {
+      for (auto* out : n->outputs) {
+        if (!Has(out)) {
+          is_input_of_external_op = true;
+          break;
+        }
+      }
+    }
+    return is_input_of_external_op;
+  }
+
+  bool IsOutputOfInternalOp(Node* n) {
+    bool is_output_of_internal_op = false;
+    if (Has(n) && n && n->IsVar() && n->Var()) {
+      for (auto* in : n->inputs) {
+        if (Has(in)) {
+          is_output_of_internal_op = true;
+          break;
+        }
+      }
+    }
+    return is_output_of_internal_op;
+  }
+
   void TopologicalSort() {
     if (!is_sorted_) {
       std::unordered_map<Node*, std::vector<Node*>> inputs_map;
...@@ -236,7 +259,6 @@ class SubGraph {
   bool save_intermediate_out_{true};
 
   std::unordered_set<Node*> nodes_set_;
-  std::vector<Node*> intermediate_out_nodes_{};
   bool is_sorted_{false};
   std::vector<Node*> sorted_nodes_;
 };
......
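The three private predicates induce the output/intermediate classification used above. A worked example (hypothetical graph): let subgraph S contain ops {mul, add} wired as x, y -> mul -> t -> add -> z, with t also feeding a relu outside S. Then:

// For S = {mul, add}, with t feeding an external relu:
//   GetOutputVarNodes(true)      -> {t, z}  // every output of an internal op
//   GetOutputVarNodes(false)     -> {t, z}  // t escapes to relu, so it stays
//   GetIntermediateOutVarNodes() -> {}      // t is input of an external op
// If t fed only add, t would become intermediate and
//   GetOutputVarNodes(false)     -> {z}.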
...@@ -19,6 +19,7 @@
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/errors.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/string/pretty_log.h"
 
 namespace paddle {
...@@ -54,7 +55,7 @@ void LogQuantizationDisabled(Node* op) {
   std::stringstream msg_ss;
   VLOG(4) << "Quantization skipped for operator " << op->Name()
           << " (type: " << op->Op()->Type() << ", id: " << op->id()
-          << "). Attribute use_quantizer = false.";
+          << "). Attribute mkldnn_data_type != \"int8\".";
 }
 
 }  // namespace
...@@ -228,12 +229,12 @@ double CPUQuantizePass::GetScaleValueForNode(const Node* node,
 bool CPUQuantizePass::IsOpDequantized(const Node* node) const {
   return node->Op()->Type() == "dequantize" ||
-         node->Op()->GetAttrIfExists<bool>("use_quantizer");
+         platform::HasOpINT8DataType(node->Op());
 }
 
 bool CPUQuantizePass::IsOpQuantized(const Node* node) const {
   return node->Op()->Type() == "quantize" ||
-         node->Op()->GetAttrIfExists<bool>("use_quantizer");
+         platform::HasOpINT8DataType(node->Op());
 }
 
 void CPUQuantizePass::QuantizeConv(Graph* graph,
...@@ -248,10 +249,9 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
                                     Graph* g) {
     VLOG(4) << "Quantize conv2d op";
     GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
-    auto* conv_op_desc = conv_op->Op();
 
     // skip if should not be quantized
-    if (!conv_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(conv_op->Op())) {
       LogQuantizationDisabled(conv_op);
       return;
     }
...@@ -353,14 +353,13 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const {
                                     Graph* g) {
     VLOG(4) << "Quantize fc op";
     GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fc_pattern);
-    auto* fc_op_desc = fc->Op();
 
     // skip if should not be quantized
-    if (!fc_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(fc->Op())) {
       LogQuantizationDisabled(fc);
       return;
     }
-    if (!fc_op_desc->GetAttrIfExists<bool>("use_mkldnn")) {
+    if (!fc->Op()->GetAttrIfExists<bool>("use_mkldnn")) {
       return;
     }
...@@ -420,10 +419,9 @@ void CPUQuantizePass::QuantizePool(Graph* graph) const {
                                     Graph* g) {
     VLOG(4) << "Quantize pool2d op";
     GET_IR_NODE_FROM_SUBGRAPH(pool_op, pool_op, pool_pattern);
-    auto* pool_op_desc = pool_op->Op();
 
     // skip if should not be quantized
-    if (!pool_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(pool_op->Op())) {
       LogQuantizationDisabled(pool_op);
       return;
     }
...@@ -465,10 +463,9 @@ void CPUQuantizePass::QuantizeConcat(Graph* graph) const {
                                     Graph* g) {
     VLOG(4) << "Quantize concat op";
     GET_IR_NODE_FROM_SUBGRAPH(concat_op, concat_op, concat_pattern);
-    auto* concat_op_desc = concat_op->Op();
 
     // skip if should not be quantized
-    if (!concat_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(concat_op->Op())) {
       LogQuantizationDisabled(concat_op);
       return;
     }
...@@ -511,10 +508,9 @@ void CPUQuantizePass::QuantizePriorBox(Graph* graph) const {
                                     Graph* g) {
     VLOG(4) << "Quantize prior_box op";
     GET_IR_NODE_FROM_SUBGRAPH(prior_box_op, prior_box_op, prior_box_pattern);
-    auto* prior_box_op_desc = prior_box_op->Op();
 
     // skip if should not be quantized
-    if (!prior_box_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(prior_box_op->Op())) {
       LogQuantizationDisabled(prior_box_op);
       return;
     }
...@@ -554,10 +550,9 @@ void CPUQuantizePass::QuantizeTranspose(Graph* graph) const {
                                     Graph* g) {
     VLOG(4) << "Quantize transpose op";
     GET_IR_NODE_FROM_SUBGRAPH(transpose_op, transpose_op, transpose_pattern);
-    auto* transpose_op_desc = transpose_op->Op();
 
     // skip if should not be quantized
-    if (!transpose_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(transpose_op->Op())) {
       LogQuantizationDisabled(transpose_op);
       return;
     }
...@@ -609,10 +604,9 @@ void CPUQuantizePass::QuantizeReshape(Graph* graph) const {
                                     Graph* g) {
     VLOG(4) << "Quantize reshape op";
     GET_IR_NODE_FROM_SUBGRAPH(reshape_op, reshape_op, reshape_pattern);
-    auto* reshape_op_desc = reshape_op->Op();
 
     // skip if should not be quantized
-    if (!reshape_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(reshape_op->Op())) {
       LogQuantizationDisabled(reshape_op);
       return;
     }
...@@ -662,10 +656,9 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const {
                                     Graph* g) {
     VLOG(4) << "Quantize matmul op";
     GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, matmul_pattern);
-    auto* matmul_op_desc = matmul_op->Op();
 
     // skip if should not be quantized
-    if (!matmul_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(matmul_op->Op())) {
       LogQuantizationDisabled(matmul_op);
       return;
     }
...@@ -732,10 +725,9 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const {
     VLOG(4) << "Quantize elementwise_add op";
     GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
                               elementwise_add_pattern);
-    auto* elementwise_add_op_desc = elementwise_add_op->Op();
 
     // skip if should not be quantized
-    if (!elementwise_add_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(elementwise_add_op->Op())) {
       LogQuantizationDisabled(elementwise_add_op);
       return;
     }
......
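platform::HasOpINT8DataType comes from the newly included mkldnn_helper.h and is not itself part of this diff. A sketch consistent with how the pass uses it, including the legacy flag kept for compatibility (the real definition may differ):

inline bool HasOpINT8DataType(const paddle::framework::OpDesc* op) {
  // New-style attribute, plus the deprecated use_quantizer flag as fallback.
  return op->GetAttrIfExists<std::string>("mkldnn_data_type") == "int8" ||
         op->GetAttrIfExists<bool>("use_quantizer");
}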
...@@ -26,7 +26,7 @@ namespace ir {
 void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
            const std::vector<std::string>& inputs,
            const std::vector<std::string>& outputs, bool use_mkldnn,
-           bool use_quantizer = false) {
+           const std::string& mkldnn_data_type = "float32") {
   auto* op = prog->MutableBlock(0)->AppendOp();
   op->SetType(type);
   op->SetAttr("use_mkldnn", use_mkldnn);
...@@ -47,14 +47,14 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
       op->SetAttr("fuse_residual_connection", false);
     }
     op->SetOutput("Output", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
     op->SetAttr("Scale_in", 1.0f);
     op->SetAttr("Scale_out", 1.0f);
     op->SetAttr("Scale_weights", std::vector<float>{1.0f});
   } else if (type == "pool2d" || type == "transpose2" || type == "reshape2") {
     op->SetInput("X", {inputs[0]});
     op->SetOutput("Out", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
   } else if (type == "dropout") {
     op->SetInput("X", {inputs[0]});
     op->SetOutput("Out", {outputs[0]});
...@@ -63,14 +63,14 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     if (inputs.size() > 1) op->SetInput("W", {inputs[1]});
     if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]});
     op->SetOutput("Out", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
     op->SetAttr("Scale_in", 1.0f);
     op->SetAttr("Scale_out", 1.0f);
     op->SetAttr("Scale_weights", std::vector<float>{1.0f});
   } else if (type == "concat") {
     op->SetInput("X", inputs);
     op->SetOutput("Out", outputs);
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
   } else if (type == "dequantize") {
     op->SetInput("Input", {inputs[0]});
     op->SetOutput("Output", {outputs[0]});
...@@ -79,7 +79,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     op->SetInput("X", {inputs[0]});
     if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
     op->SetOutput("Out", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
     op->SetAttr("Scale_x", 1.0f);
     op->SetAttr("Scale_y", 1.0f);
     op->SetAttr("Scale_out", 1.0f);
...@@ -87,7 +87,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     op->SetInput("X", {inputs[0]});
     if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
     op->SetOutput("Out", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
     op->SetAttr("Scale_x", 1.0f);
     op->SetAttr("Scale_y", 1.0f);
     op->SetAttr("Scale_out", 1.0f);
...@@ -142,7 +142,8 @@ static const std::initializer_list<std::string> variable_names{
 // d->Dropout1->g and (g, w5, b3)->Fc1->h and (h,w3,b1,i)->Conv3->j
 //
 // (d,w4, b2)->Conv4->i
-ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
+ProgramDesc BuildProgramDesc(bool use_mkldnn,
+                             const std::string& mkldnn_data_type) {
   ProgramDesc prog;
   for (auto& v : variable_names) {
     auto* var = prog.MutableBlock(0)->Var(v);
...@@ -152,21 +153,21 @@ ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
   }
 
   SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"c"}, use_mkldnn,
-        use_quantizer);
-  SetOp(&prog, "pool2d", "Pool1", {"c"}, {"d"}, use_mkldnn, use_quantizer);
+        mkldnn_data_type);
+  SetOp(&prog, "pool2d", "Pool1", {"c"}, {"d"}, use_mkldnn, mkldnn_data_type);
   SetOp(&prog, "conv2d", "Conv2", {"d", "w2"}, {"e"}, use_mkldnn,
-        use_quantizer);
-  SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, use_quantizer);
+        mkldnn_data_type);
+  SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, mkldnn_data_type);
 
   SetOp(&prog, "dropout", "Dropout1", {"d"}, {"g"}, use_mkldnn);
   SetOp(&prog, "fc", "Fc1", {"g", "w5", "b3"}, {"h"}, use_mkldnn,
-        use_quantizer);
+        mkldnn_data_type);
   SetOp(&prog, "conv2d", "Conv3", {"h", "w3", "b1", "i"}, {"j"}, use_mkldnn,
-        use_quantizer);
+        mkldnn_data_type);
 
   SetOp(&prog, "conv2d", "Conv4", {"c", "w4", "b2"}, {"i"}, use_mkldnn,
-        use_quantizer);
+        mkldnn_data_type);
 
   return prog;
 }
...@@ -215,7 +216,7 @@ void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
 TEST(CpuQuantizePass, quantize) {
   bool use_mkldnn = true;
-  bool use_quantizer = true;
+  std::string mkldnn_data_type = "int8";
   // (a->QUANT1->IN1,w1)->Conv1->OUT1->DEQUANT1->c and
   // c->QUANT2->IN2->Pool1->OUT2->DEQUANT2->d
   //
...@@ -228,16 +229,16 @@ TEST(CpuQuantizePass, quantize) {
   // (d->QUANT7->IN7,w4, b2)->Conv4->DEQUANT6->OUT6->i
   // Insert nodes: 8 Quant + 8 IN + 7 OUT + 7 DEQUANT
   int added_nodes = 8 + 8 + 7 + 7;
-  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 8, 7, added_nodes,
-           2.0f * 127);
+  MainTest(BuildProgramDesc(use_mkldnn, mkldnn_data_type), 4, 2, 8, 7,
+           added_nodes, 2.0f * 127);
 }
 
 TEST(CpuQuantizePass, do_not_quantize) {
   bool use_mkldnn = true;
-  bool use_quantizer = false;
+  std::string mkldnn_data_type = "float32";
   int added_nodes = 0;
-  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 0, 0, added_nodes,
-           1.0f);
+  MainTest(BuildProgramDesc(use_mkldnn, mkldnn_data_type), 4, 2, 0, 0,
+           added_nodes, 1.0f);
 }
 
 static const std::initializer_list<std::string> variable_names_concat = {
...@@ -250,10 +251,10 @@ static const std::initializer_list<std::string> variable_names_concat = {
 ProgramDesc BuildProgramDescConcat() {
   ProgramDesc prog;
-  SetOp(&prog, "pool2d", "Pool1", {"a1"}, {"b1"}, true, false);
-  SetOp(&prog, "pool2d", "Pool2", {"a2"}, {"b2"}, true, false);
-  SetOp(&prog, "concat", "Concat", {"b1", "b2"}, {"c"}, true, true);
-  SetOp(&prog, "pool2d", "Pool3", {"c"}, {"d"}, true, false);
+  SetOp(&prog, "pool2d", "Pool1", {"a1"}, {"b1"}, true, "float32");
+  SetOp(&prog, "pool2d", "Pool2", {"a2"}, {"b2"}, true, "float32");
+  SetOp(&prog, "concat", "Concat", {"b1", "b2"}, {"c"}, true, "int8");
+  SetOp(&prog, "pool2d", "Pool3", {"c"}, {"d"}, true, "float32");
 
   return prog;
 }
...@@ -321,11 +322,11 @@ ProgramDesc BuildProgramDescTranspose() {
     }
   }
 
-  SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"b"}, true, true);
-  SetOp(&prog, "transpose2", "Transpose1", {"b"}, {"c"}, true, true);
-  SetOp(&prog, "conv2d", "Conv1", {"c", "w2"}, {"d"}, true, true);
-  SetOp(&prog, "transpose2", "Transpose2", {"d"}, {"e"}, true, true);
-  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
+  SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"b"}, true, "int8");
+  SetOp(&prog, "transpose2", "Transpose1", {"b"}, {"c"}, true, "int8");
+  SetOp(&prog, "conv2d", "Conv1", {"c", "w2"}, {"d"}, true, "int8");
+  SetOp(&prog, "transpose2", "Transpose2", {"d"}, {"e"}, true, "int8");
+  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32");
 
   return prog;
 }
...@@ -400,8 +401,8 @@ ProgramDesc BuildProgramDescReshape() {
     prog.MutableBlock(0)->Var(v);
   }
   SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
-  SetOp(&prog, "reshape2", "Reshape2", {"b"}, {"c"}, true, true);
-  SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, false);
+  SetOp(&prog, "reshape2", "Reshape2", {"b"}, {"c"}, true, "int8");
+  SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32");
 
   return prog;
 }
...@@ -415,9 +416,9 @@ ProgramDesc BuildProgramDescReshapeBetweenNonQuantizedOp() {
     prog.MutableBlock(0)->Var(v);
   }
 
-  SetOp(&prog, "transpose2", "Transpose2", {"a"}, {"b"}, true, false);
-  SetOp(&prog, "reshape2", "Reshape2", {"b"}, {"c"}, true, true);
-  SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, false);
+  SetOp(&prog, "transpose2", "Transpose2", {"a"}, {"b"}, true, "float32");
+  SetOp(&prog, "reshape2", "Reshape2", {"b"}, {"c"}, true, "int8");
+  SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32");
 
   return prog;
 }
...@@ -505,8 +506,8 @@ ProgramDesc BuildProgramDescMatmul() {
   }
   SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
   SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true);
-  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, true);
-  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
+  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, "int8");
+  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32");
 
   return prog;
 }
...@@ -518,8 +519,8 @@ ProgramDesc BuildProgramDescMatmulNotQuantized() {
   }
   SetOp(&prog, "dropout", "Dropout", {"a"}, {"b"}, false);
   SetOp(&prog, "dequantize", "Dequantize", {"c"}, {"d"}, true);
-  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, true);
-  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
+  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, "int8");
+  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32");
 
   return prog;
 }
...@@ -590,8 +591,8 @@ ProgramDesc BuildProgramDescElementwiseAdd() {
   SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
   SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true);
   SetOp(&prog, "elementwise_add", "ElementwiseAdd", {"b", "d"}, {"e"}, true,
-        true);
-  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
+        "int8");
+  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32");
 
   return prog;
 }
......
...@@ -32,11 +32,16 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
              n->id()) != excluded_ids_list.end())
       continue;
     auto* op = n->Op();
-    if (op->HasAttr("use_quantizer") || op->HasProtoAttr("use_quantizer")) {
-      if (op_types_list.empty()) {
-        op->SetAttr("use_quantizer", true);
-      } else if (std::find(op_types_list.begin(), op_types_list.end(),
-                           op->Type()) != op_types_list.end()) {
+    if (op->HasAttr("mkldnn_data_type") ||
+        op->HasProtoAttr("mkldnn_data_type")) {
+      // use_quantizer is no longer used; assign a value for compatibility.
+      if (op->GetAttrIfExists<bool>("use_quantizer")) {
+        op->SetAttr("mkldnn_data_type", std::string("int8"));
+      }
+      if (std::find(op_types_list.begin(), op_types_list.end(), op->Type()) !=
+          op_types_list.end()) {
+        op->SetAttr("mkldnn_data_type", std::string("int8"));
         op->SetAttr("use_quantizer", true);
       }
     }
...@@ -53,7 +58,10 @@ REGISTER_PASS(cpu_quantize_placement_pass,
     // a vector of operator type names to be quantized ("conv2d" etc.)
     // the second param is the default value for this vector
     .DefaultPassAttr("quantize_enabled_op_types",
-                     new std::unordered_set<std::string>())
+                     new std::unordered_set<std::string>(
+                         {"concat", "conv2d", "elementwise_add", "fc", "matmul",
+                          "pool2d", "prior_box", "relu", "reshape2",
+                          "transpose2"}))
     // a vector of operator ids that are to be excluded from quantization
     // the second param is the default value for this vector
     .DefaultPassAttr("quantize_excluded_op_ids", new std::unordered_set<int>());
...@@ -309,7 +309,8 @@ std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubGraphs() {
     BriefNode *brief_node = itr.second;
 
     if (!Agent(brief_node->node).marked()) {
-      VLOG(4) << brief_node->node->id() << " node not a trt candidate.";
+      VLOG(4) << brief_node->node->id() << " node named "
+              << brief_node->node->Name() << " is not a trt candidate.";
       continue;
     }
......
...@@ -59,6 +59,8 @@ inline LibraryType StringToLibraryType(const char* ctype) {
     // CPU, CUDA, PLAIN are the same library type.
   } else if (s == std::string("CPU")) {
     return LibraryType::kPlain;
+  } else if (s == std::string("XPU")) {
+    return LibraryType::kPlain;
   } else if (s == std::string("CUDA")) {
     return LibraryType::kPlain;
   } else {
......
...@@ -102,6 +102,7 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, ...@@ -102,6 +102,7 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program,
workers_[i]->SetRootScope(root_scope_); workers_[i]->SetRootScope(root_scope_);
workers_[i]->CreateDeviceResource(main_program); // Program workers_[i]->CreateDeviceResource(main_program); // Program
workers_[i]->BindingDataFeedMemory(); workers_[i]->BindingDataFeedMemory();
workers_[i]->CacheProgram(main_program);
} }
} }
......
(This diff is collapsed.)
...@@ -268,6 +268,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, ...@@ -268,6 +268,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 #define REGISTER_OP_CPU_KERNEL(op_type, ...)                                \
   REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
+#define REGISTER_OP_XPU_KERNEL(op_type, ...)                                \
+  REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
 #define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class,  \
                               customized_name,                     \
                               customized_type_value,               \
@@ -298,6 +301,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
       ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
       __VA_ARGS__)
+#define REGISTER_OP_XPU_KERNEL_FUNCTOR(op_type, ...)                  \
+  REGISTER_OP_KERNEL_EX(                                              \
+      op_type, XPU, ::paddle::platform::XPUPlace, DEFAULT_TYPE,       \
+      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
+      __VA_ARGS__)
 /**
  * Macro to mark what Operator and Kernel
  * we will use and tell the compiler to
...
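These macros mirror the existing CPU variants so an operator can register kernels for Baidu Kunlun (XPU) devices. A hypothetical registration is sketched below; `ReluXPUKernel`, the `ops` alias, and the file name are illustrative names, not part of this diff:

    // In some relu_op_xpu.cc; everything here is a sketch of intended usage.
    namespace ops = paddle::operators;

    // Registers a float relu kernel for the XPU place via the new macro.
    REGISTER_OP_XPU_KERNEL(
        relu, ops::ReluXPUKernel<paddle::platform::XPUDeviceContext, float>);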
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_version_registry.h"
@@ -64,9 +64,6 @@ constexpr char kZeroVarSuffix[] = "@ZERO";
 /// Variables with this suffix are the new Gradient.
 constexpr char kNewGradSuffix[] = "@NEWGRAD@";
-/// Variables with this suffix are the loaded from pre-train model.
-constexpr char kLoadedVarSuffix[] = "@LOADED";
 /// RuntimeContext is used to relate input/output names of Operator with
 /// the corresponding variables in name scope.
 /// If an Op has attribute kEnableCacheRuntimeContext, it means that in a same
...
@@ -449,6 +449,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
                                    const BuildStrategy &build_strategy,
                                    ir::Graph *graph)
     : member_(new ParallelExecutorPrivate(places, scope)) {
+  PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]),
+                 platform::errors::Unavailable(
+                     "XPU is not supported in ParallelExecutor"));
   ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
                                  member_->places_.size());
   member_->use_cuda_ = exec_strategy.use_cuda_;
...
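The guard fails fast instead of letting an unsupported XPU place surface as an obscure error deeper inside the executor. The enforced condition reads as a small predicate; a sketch, assuming `is_xpu_place` comes from `paddle/fluid/platform/place.h` as elsewhere in this commit:

    #include <vector>
    #include "paddle/fluid/platform/place.h"

    // True only for a non-empty place list whose first device is not an XPU;
    // this is the same condition the PADDLE_ENFORCE above asserts.
    bool PlacesSupportedByParallelExecutor(
        const std::vector<paddle::platform::Place>& places) {
      return !places.empty() && !paddle::platform::is_xpu_place(places[0]);
    }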