Commit fe3d90ba, authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_slice_to_pten

......@@ -6,12 +6,14 @@ paddle/fluid/eager/api/generated/*
paddle/fluid/op_use_default_grad_maker_DEV.spec
paddle/fluid/op_use_default_grad_maker_PR.spec
paddle/phi/api/backward/backward_api.h
paddle/phi/api/backward/sparse_bw_api.h
paddle/phi/api/include/api.h
paddle/phi/api/include/sparse_api.h
paddle/phi/api/lib/api.cc
paddle/phi/api/lib/dygraph_api.*
paddle/phi/api/lib/backward_api.cc
paddle/phi/api/lib/sparse_api.cc
paddle/phi/api/lib/sparse_bw_api.cc
paddle/phi/extension.h
paddle/phi/include/*
paddle/phi/infermeta/generated.*
......@@ -54,6 +56,7 @@ paddle/infrt/dialect/pd_ops.td
paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td
paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td
tools/infrt/kernels.json
tools/infrt/kernel_signature.json
paddle/infrt/dialect/pd_ops_info.h
.lit_test_times.txt
paddle/infrt/tests/dialect/Output
......
......@@ -53,6 +53,7 @@ option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF)
# to develop some acl related functionality on x86
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF)
# Note(zhouwei): It uses the options above, so it is placed here
include(init)
include(generic) # simplify cmake module
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if (NOT WITH_ONNXRUNTIME)
return()
endif ()
if (WITH_ARM)
message(SEND_ERROR "The current onnxruntime backend doesn't support ARM CPUs")
return()
endif ()
INCLUDE(ExternalProject)
add_definitions(-DPADDLE_WITH_ONNXRUNTIME)
SET(ONNXRUNTIME_PROJECT "extern_onnxruntime")
SET(ONNXRUNTIME_PREFIX_DIR ${THIRD_PARTY_PATH}/onnxruntime)
SET(ONNXRUNTIME_SOURCE_DIR ${THIRD_PARTY_PATH}/onnxruntime/src/${ONNXRUNTIME_PROJECT})
SET(ONNXRUNTIME_INSTALL_DIR ${THIRD_PARTY_PATH}/install/onnxruntime)
SET(ONNXRUNTIME_INC_DIR "${ONNXRUNTIME_INSTALL_DIR}/include" CACHE PATH "onnxruntime include directory." FORCE)
SET(ONNXRUNTIME_LIB_DIR "${ONNXRUNTIME_INSTALL_DIR}/lib" CACHE PATH "onnxruntime lib directory." FORCE)
SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${ONNXRUNTIME_LIB_DIR}")
if (WIN32)
SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-win-x64-1.10.0.zip")
elseif (APPLE)
SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-osx-x86_64-1.10.0.tgz")
else ()
SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-linux-x64-1.10.0.tgz")
endif()
INCLUDE_DIRECTORIES(${ONNXRUNTIME_INC_DIR}) # For ONNXRUNTIME code to include internal headers.
if (WIN32)
SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
SET(ONNXRUNTIME_SHARED_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.lib" CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
elseif (APPLE)
SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
else ()
SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
endif ()
if (WIN32)
ExternalProject_Add(
${ONNXRUNTIME_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
URL ${ONNXRUNTIME_URL}
PREFIX ${ONNXRUNTIME_PREFIX_DIR}
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_SHARED_LIB} &&
${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.lib ${ONNXRUNTIME_LIB} &&
${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR}
BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB}
)
else ()
ExternalProject_Add(
${ONNXRUNTIME_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
URL ${ONNXRUNTIME_URL}
PREFIX ${ONNXRUNTIME_PREFIX_DIR}
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_LIB} &&
${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR}
BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB}
)
endif()
ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${ONNXRUNTIME_LIB})
ADD_DEPENDENCIES(onnxruntime ${ONNXRUNTIME_PROJECT})
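# With the imported target above, a plain-CMake consumer could link it
# directly, e.g. (hypothetical target name):
#   target_link_libraries(my_inference_target onnxruntime)
# Within Paddle the target is typically pulled in through the usual DEPS lists.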
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT WITH_ONNXRUNTIME)
return()
endif()
if (WITH_ARM)
message(SEND_ERROR "The current onnxruntime backend doesn't support ARM CPUs")
return()
endif ()
INCLUDE(ExternalProject)
SET(PADDLE2ONNX_PROJECT "extern_paddle2onnx")
SET(PADDLE2ONNX_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle2onnx)
SET(PADDLE2ONNX_INSTALL_DIR ${THIRD_PARTY_PATH}/install/paddle2onnx)
SET(PADDLE2ONNX_INC_DIR "${PADDLE2ONNX_INSTALL_DIR}/include" CACHE PATH "paddle2onnx include directory." FORCE)
SET(PADDLE2ONNX_REPOSITORY ${GIT_URL}/PaddlePaddle/Paddle2ONNX.git)
SET(PADDLE2ONNX_TAG cpp)
SET(LIBDIR "lib")
SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}")
INCLUDE_DIRECTORIES(${PADDLE2ONNX_INC_DIR}) # For PADDLE2ONNX code to include internal headers.
if(WIN32)
SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.lib" CACHE FILEPATH "paddle2onnx static library." FORCE)
SET(PADDLE2ONNX_SHARED_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.dll" CACHE FILEPATH "paddle2onnx shared library." FORCE)
elseif(APPLE)
SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.dylib" CACHE FILEPATH "PADDLE2ONNX library." FORCE)
else()
SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.so" CACHE FILEPATH "PADDLE2ONNX library." FORCE)
endif(WIN32)
# The directory containing protoc is required to compile onnx.
string(REPLACE "/" ";" PROTOC_BIN_PATH ${PROTOBUF_PROTOC_EXECUTABLE})
list(POP_BACK PROTOC_BIN_PATH)
list(JOIN PROTOC_BIN_PATH "/" PROTOC_BIN_PATH)
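# Illustration (hypothetical path): if PROTOBUF_PROTOC_EXECUTABLE is
# /usr/local/bin/protoc, the REPLACE/POP_BACK/JOIN sequence above splits the
# path on "/", drops the trailing "protoc", and rejoins it, leaving
# PROTOC_BIN_PATH = /usr/local/bin.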
set(PADDLE2ONNX_OPTIONAL_ARGS
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH}
-DWITH_STATIC=OFF
-DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
)
if (WITH_PYTHON)
set(PADDLE2ONNX_OPTIONAL_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS}
-DPYTHON_EXECUTABLE:FILEPATH=${PYTHON_EXECUTABLE}
-DPYTHON_INCLUDE_DIR:PATH=${PYTHON_INCLUDE_DIR}
-DPYTHON_LIBRARY:FILEPATH=${PYTHON_LIBRARY}
)
endif ()
ExternalProject_Add(
${PADDLE2ONNX_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
GIT_REPOSITORY ${PADDLE2ONNX_REPOSITORY}
GIT_TAG ${PADDLE2ONNX_TAG}
DEPENDS protobuf
PREFIX ${PADDLE2ONNX_PREFIX_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PADDLE2ONNX_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
BUILD_BYPRODUCTS ${PADDLE2ONNX_LIB}
)
ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE2ONNX_LIB})
ADD_DEPENDENCIES(paddle2onnx ${PADDLE2ONNX_PROJECT})
......@@ -198,7 +198,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
"-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}")
ENDIF()
if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
if(WITH_ONNXRUNTIME)
SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
SET(PROTOBUF_TAG v3.18.0)
elseif(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
......@@ -248,7 +252,9 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
)
ENDFUNCTION()
if(WITH_ASCEND OR WITH_ASCEND_CL)
if(WITH_ONNXRUNTIME)
SET(PROTOBUF_VERSION 3.18.0)
elseif(WITH_ASCEND OR WITH_ASCEND_CL)
SET(PROTOBUF_VERSION 3.8.0)
elseif(WITH_IPU)
SET(PROTOBUF_VERSION 3.6.1)
......
......@@ -36,7 +36,7 @@ ENDIF()
if(NOT DEFINED XPU_BASE_URL)
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220228")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220307")
else()
SET(XPU_BASE_URL "${XPU_BASE_URL}")
endif()
......
......@@ -114,6 +114,24 @@ function(copy_part_of_thrid_party TARGET DST)
endif()
endif()
if (WITH_ONNXRUNTIME)
set(dst_dir "${DST}/third_party/install/onnxruntime")
copy(${TARGET}
SRCS ${ONNXRUNTIME_INC_DIR} ${ONNXRUNTIME_LIB_DIR}
DSTS ${dst_dir} ${dst_dir})
set(dst_dir "${DST}/third_party/install/paddle2onnx")
if(WIN32)
copy(${TARGET}
SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_SHARED_LIB} ${PADDLE2ONNX_LIB}
DSTS ${dst_dir}/include ${dst_dir}/lib ${dst_dir}/lib)
else()
copy(${TARGET}
SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_LIB}
DSTS ${dst_dir}/include ${dst_dir}/lib)
endif()
endif()
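# Note: on Windows both the import library (paddle2onnx.lib) and the runtime
# DLL are shipped, while on other platforms the single shared library is
# enough; the onnxruntime headers and libs are copied as whole directories.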
set(dst_dir "${DST}/third_party/install/gflags")
copy(${TARGET}
SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
......
......@@ -478,7 +478,7 @@ function(op_library TARGET)
if (${pybind_flag} EQUAL 0)
# NOTE(*): activation uses macros to register its kernels, so set use_op manually.
if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP(relu);\n")
file(APPEND ${pybind_file} "USE_OP_ITSELF(relu);\n")
elseif(${TARGET} STREQUAL "fake_dequantize")
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
elseif(${TARGET} STREQUAL "fake_quantize")
......
......@@ -134,8 +134,8 @@ function(kernel_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu)
list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu)
endif()
endif()
if (WITH_XPU)
......@@ -197,92 +197,88 @@ function(kernel_library TARGET)
# kernel source file level
# level 1: base device kernel
# - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs
# - cpu_srcs / gpu_srcs / xpu_srcs / gpudnn_srcs / kps_srcs
# level 2: device-independent kernel
# - common_srcs
# level 3: Kernel implemented by reusing device-independent kernel
# - selected_rows_srcs
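# Illustration (hypothetical kernel "foo"): with cpu/foo.cc and gpu/foo.cu
# present, the rules below build foo_cpu and foo_gpu as base device kernels,
# foo_common if device-independent sources exist, foo_sr for the selected_rows
# variant, and finally a unified target foo that depends on all of them.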
set(base_device_kernels)
set(device_independent_kernel)
set(high_level_kernels)
# Build the target according to the different src organizations
if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND
(${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0))
# If the common_srcs/selected_rows_srcs depend on specific device srcs, build the target using this rule.
if (WITH_GPU)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
# 1. Compile the base device kernels
if (${cpu_srcs_len} GREATER 0)
cc_library(${TARGET}_cpu SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
list(APPEND base_device_kernels ${TARGET}_cpu)
endif()
if (${gpu_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
hip_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
elseif (WITH_XPU_KP)
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
endif()
else()
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
cc_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
list(APPEND base_device_kernels ${TARGET}_gpu)
endif()
if (${xpu_srcs_len} GREATER 0)
cc_library(${TARGET}_xpu SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
list(APPEND base_device_kernels ${TARGET}_xpu)
endif()
# If there are only device-specific srcs, build the target using this rule.
elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
if (${gpudnn_srcs_len} GREATER 0)
if (WITH_GPU)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
nv_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
elseif (WITH_XPU_KP)
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
else()
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
list(APPEND base_device_kernels ${TARGET}_gpudnn)
endif()
if (${kps_srcs_len} GREATER 0)
# kps_srcs_len can be > 0 only when WITH_XPU_KP is ON
xpu_library(${TARGET}_kps SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
list(APPEND base_device_kernels ${TARGET}_kps)
endif()
# If the selected_rows_srcs depend on common_srcs, build the target using this rule.
elseif (${common_srcs_len} GREATER 0 AND ${selected_rows_srcs_len} GREATER 0)
# 2. Compile the device-independent kernels
if (${common_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
nv_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
elseif (WITH_ROCM)
hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
hip_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
elseif (WITH_XPU_KP)
xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
xpu_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
else()
cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
cc_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
endif()
# If there are only common_srcs or selected_rows_srcs, build the target using the rules below.
elseif (${common_srcs_len} GREATER 0)
list(APPEND device_independent_kernel ${TARGET}_common)
endif()
# 3. Compile kernels that reuse other kernels
if (${selected_rows_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
nv_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
elseif (WITH_XPU_KP)
xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
xpu_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
else()
cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
cc_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
endif()
elseif (${selected_rows_srcs_len} GREATER 0)
list(APPEND high_level_kernels ${TARGET}_sr)
endif()
# 4. Build the unified target
list(LENGTH base_device_kernels base_device_kernels_len)
list(LENGTH device_independent_kernel device_independent_kernel_len)
list(LENGTH high_level_kernels high_level_kernels_len)
if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0 OR
${high_level_kernels_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
nv_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
elseif (WITH_XPU_KP)
xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
xpu_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
else()
cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
cc_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
endif()
else()
set(target_build_flag 0)
......
......@@ -250,6 +250,12 @@ IF(WITH_TESTING OR WITH_DISTRIBUTE)
list(APPEND third_party_deps extern_gtest)
ENDIF()
if(WITH_ONNXRUNTIME)
include(external/onnxruntime) # download, build, install onnxruntime and paddle2onnx
include(external/paddle2onnx)
list(APPEND third_party_deps extern_onnxruntime extern_paddle2onnx)
endif()
if(WITH_GPU)
if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
include(external/cub) # download cub
......
......@@ -7,3 +7,6 @@ cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup)
if(WITH_NCCL)
cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api)
endif()
if(WITH_ASCEND_CL)
cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api)
endif()
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <error.h>
#include <string>
#include "boost/variant.hpp"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/enforce_npu.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace distributed {
class NPUEventManager {
public:
NPUEventManager() = default;
~NPUEventManager() {
if (is_created_) {
platform::NPUDeviceGuard guard(device_index_);
platform::NPUEventDestroy(event_);
}
}
NPUEventManager(const NPUEventManager&) = delete;
NPUEventManager& operator=(const NPUEventManager&) = delete;
NPUEventManager(NPUEventManager&& other) {
std::swap(is_created_, other.is_created_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
}
NPUEventManager& operator=(NPUEventManager&& other) {
std::swap(is_created_, other.is_created_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
return *this;
}
bool IsCreated() const { return is_created_; }
int8_t DeviceId() const { return device_index_; }
aclrtEvent GetRawNPUEvent() const { return event_; }
void Record(const paddle::platform::NPUDeviceContext& ctx) {
auto device_index = ctx.GetPlace().device;
if (!is_created_) {
CreateEvent(device_index);
}
PADDLE_ENFORCE_EQ(device_index, device_index_,
platform::errors::PreconditionNotMet(
"NPUDeviceContext's device %d does not match"
"Event's device %d",
device_index, device_index_));
platform::NPUDeviceGuard guard(device_index_);
platform::NPUEventRecord(event_, ctx.stream());
}
bool Query() const {
aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
platform::NPUEventQuery(event_, &status);
if (status == ACL_EVENT_STATUS_COMPLETE) {
return true;
}
return false;
}
void Block(const paddle::platform::NPUDeviceContext& ctx) const {
if (is_created_) {
auto device_index = ctx.GetPlace().device;
PADDLE_ENFORCE_EQ(device_index, device_index_,
platform::errors::PreconditionNotMet(
"CUDADeviceContext's device %d does not match"
"Event's device %d",
device_index, device_index_));
platform::NPUDeviceGuard guard(device_index_);
platform::NPUStreamWaitEvent(ctx.stream(), event_);
}
}
private:
bool is_created_{false};
aclrtEvent event_{};
int8_t device_index_{0};
private:
void CreateEvent(int device_index) {
device_index_ = device_index;
platform::NPUDeviceGuard guard(device_index);
platform::NPUEventCreate(&event_);
is_created_ = true;
}
};
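// Usage sketch (hypothetical contexts ctx_a / ctx_b): the event is created
// lazily on the first Record() and can then order two streams, e.g.
//   NPUEventManager ev;
//   ev.Record(ctx_a);  // record on ctx_a's stream (creates the event)
//   ev.Block(ctx_b);   // make ctx_b's stream wait until the event completes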
class HCCLCommManager {
public:
explicit HCCLCommManager(HcclComm hcclComm) : hccl_comm_(hcclComm) {}
HCCLCommManager() : HCCLCommManager(nullptr) {}
~HCCLCommManager() noexcept {
std::unique_lock<std::mutex> lock(mutex_);
if (hccl_comm_) {
platform::dynload::HcclCommDestroy(hccl_comm_);
}
}
static std::shared_ptr<HCCLCommManager> Create(int num_ranks, int rank,
HcclRootInfo* comm_id,
HcclComm hccl_comm) {
auto hccl_manager = std::make_shared<HCCLCommManager>();
auto ret = platform::dynload::HcclCommInitRootInfo(num_ranks, comm_id, rank,
&hccl_comm);
using __NPU_STATUS_TYPE__ = decltype(ret);
constexpr auto __success_type__ =
platform::details::NPUStatusType<__NPU_STATUS_TYPE__>::kSuccess;
if (UNLIKELY(ret != __success_type__)) {
VLOG(0) << "Error: create hccl_id error.";
exit(-1);
}
hccl_manager->hccl_id_ = comm_id;
hccl_manager->rank_ = rank;
hccl_manager->hccl_comm_ = hccl_comm;
return hccl_manager;
}
HcclRootInfo* GetHcclId() const {
std::unique_lock<std::mutex> lock(mutex_);
return hccl_id_;
}
HcclComm GetHcclComm() const {
std::unique_lock<std::mutex> lock(mutex_);
return hccl_comm_;
}
HCCLCommManager(const HCCLCommManager&) = delete;
HCCLCommManager& operator=(const HCCLCommManager&) = delete;
HCCLCommManager& operator=(HCCLCommManager&& other) = delete;
HCCLCommManager(HCCLCommManager&& other) {
std::unique_lock<std::mutex> lock(other.mutex_);
std::swap(hccl_comm_, other.hccl_comm_);
}
protected:
HcclComm hccl_comm_;
HcclRootInfo* hccl_id_;
int rank_;
mutable std::mutex mutex_;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/place.h"
DECLARE_bool(hccl_blocking_wait);
// DECLARE_bool(use_stream_safe_npu_allocator);
constexpr int64_t kWaitBlockTimeout = 10;
namespace paddle {
namespace distributed {
static HcclReduceOp ToHCCLRedType(ReduceOp reduction) {
static const std::map<ReduceOp, HcclReduceOp> red_type = {
{ReduceOp::MIN, HCCL_REDUCE_MIN},
{ReduceOp::MAX, HCCL_REDUCE_MAX},
{ReduceOp::SUM, HCCL_REDUCE_SUM},
{ReduceOp::PRODUCT, HCCL_REDUCE_PROD},
};
auto it = red_type.find(reduction);
PADDLE_ENFORCE_EQ(
it != red_type.end(), true,
platform::errors::InvalidArgument("Invalid hccl reduction. "
"Must be Min | Max | Prod | Sum"));
return it->second;
}
std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&hcclID);
std::ostringstream oss;
for (size_t i = 0; i < sizeof(hcclID); ++i) {
oss << std::hex << static_cast<int>(bytes[i]);
}
return oss.str();
}
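// Note: the bytes are written with std::hex but without zero-padding, so the
// result is fine as a log/identifier string (its only use here) but is not a
// reversible encoding: e.g. the byte pairs {0x01, 0x23} and {0x12, 0x03} both
// serialize to "123".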
// Get the list of devices from the list of tensors
std::vector<Place> GetPlaceList(const std::vector<Tensor>& tensors) {
std::vector<Place> places;
places.reserve(tensors.size());
for (auto& tensor : tensors) {
places.push_back(tensor.inner_place());
}
return places;
}
// Get the device-list string from the list of devices
std::string GetKeyFromPlaces(const std::vector<Place>& places) {
std::string placeList;
for (auto& place : places) {
std::stringstream tmp;
tmp << place;
if (placeList.empty()) {
placeList += tmp.str();
} else {
placeList += "," + tmp.str();
}
}
return placeList;
}
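// e.g. for places on NPU 0 and NPU 1 the key is their printed forms joined by
// commas (the exact text comes from Place's operator<<); each unique device
// list thus gets its own cached communicator set.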
// bool CheckTensorsInNPUPlace(const std::vector<Tensor>& tensors) {
// return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) {
// return t.place() == platform::DeviceType::NPU;
// });
// }
void SyncDefaultStream(
const std::vector<Place>& places,
std::vector<NPUEventManager>& hcclEvents, // NOLINT
std::vector<std::unique_ptr<NPUDeviceContext>>& dev_ctx) { // NOLINT
for (size_t i = 0; i < places.size(); ++i) {
auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(places[i]));
hcclEvents[i].Record(*dev_ctx[i]);
hcclEvents[i].Block(*default_ctx);
}
}
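// Note: this records an event on each cached communication stream and makes
// the default stream of the same place wait on it, serializing the two
// streams at this synchronization point.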
std::shared_ptr<ProcessGroupHCCL::HCCLTask> ProcessGroupHCCL::CreateTask(
std::vector<Place> places, int rank, CommType comm_type,
const std::vector<Tensor>& inputs) {
return std::make_shared<ProcessGroupHCCL::HCCLTask>(places, rank, comm_type,
inputs);
}
ProcessGroupHCCL::HCCLTask::HCCLTask(const std::vector<Place>& places, int rank,
CommType CommType,
const std::vector<Tensor>& inputs)
: Task(rank, inputs, CommType), places_(places) {
control_events_.resize(places.size());
hcclComms_.resize(places.size());
}
ProcessGroupHCCL::HCCLTask::~HCCLTask() {}
void ProcessGroupHCCL::HCCLTask::SetOutputs(
std::vector<Tensor>& outputs) { // NOLINT
outputs_ = std::make_shared<std::vector<Tensor>>(outputs);
}
void ProcessGroupHCCL::HCCLTask::SynchronizeStreams() {
for (size_t i = 0; i < places_.size(); ++i) {
auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(places_[i]));
platform::NPUStreamWaitEvent(default_ctx->stream(),
control_events_[i].GetRawNPUEvent());
}
}
bool ProcessGroupHCCL::HCCLTask::IsCompleted() {
for (size_t i = 0; i < places_.size(); ++i) {
if (!control_events_[i].Query()) {
return false;
}
}
return true;
}
// TODO(sandyhouse): Add timeout handling for Wait; the timeout parameter is currently unused
bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) {
SynchronizeStreams();
if (FLAGS_hccl_blocking_wait) {
// NOTE(sandyhouse): This blocks the host until the task completes
while (!IsCompleted()) {
std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTimeout));
}
}
return true;
}
// Same as Wait
void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); }
ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr<Store>& store,
int rank, int size)
: ProcessGroup(rank, size), store_(store) {}
void ProcessGroupHCCL::BroadcastUniqueHCCLID(
std::vector<HcclRootInfo>& hccl_ids) { // NOLINT
if (rank_ == 0) {
for (size_t i = 0; i < hccl_ids.size(); i++) {
auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i);
auto hccl_id = std::vector<uint8_t>(
reinterpret_cast<uint8_t*>(&hccl_ids[i]),
reinterpret_cast<uint8_t*>(&hccl_ids[i]) + sizeof(HcclRootInfo));
store_->set(key, hccl_id);
}
} else {
for (size_t i = 0; i < hccl_ids.size(); i++) {
auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i);
auto ret = store_->get(key);
std::memcpy(&hccl_ids[i], ret.data(), ret.size());
}
}
}
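// Rank 0 publishes each HcclRootInfo to the store under a per-index key;
// every other rank fetches the same keys, so all ranks end up with identical
// root infos without needing a native broadcast primitive.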
// create HCCLManager cache for places_key
void ProcessGroupHCCL::CreateHCCLManagerCache(
const std::string& places_key, const std::vector<Place>& places) {
PADDLE_ENFORCE_EQ(places_key.empty(), false,
platform::errors::PreconditionNotMet(
"Not able to create/get the HCCL Communicator since "
"the NPU place are not known"));
std::vector<std::shared_ptr<HCCLCommManager>> hccl_comms;
hccl_comms.resize(places.size());
// using vector just for broadcast
std::vector<HcclRootInfo> hccl_ids;
hccl_ids.resize(1);
auto& hccl_id = hccl_ids.front();
if (rank_ == 0) {
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(&hccl_id));
}
BroadcastUniqueHCCLID(hccl_ids);
VLOG(3) << "init hccl rank: " << rank_ << ", nranks: " << size_
<< ", place: " << places_key
<< ", hccl uniqueid: " << SerializeHCCLUniqueId(hccl_id);
std::vector<std::unique_ptr<NPUDeviceContext>> dev_ctx;
dev_ctx.resize(places.size());
std::unique_ptr<HcclComm[]> comms(new HcclComm[places.size()]);
for (size_t i = 0; i < places.size(); ++i) {
platform::NPUDeviceGuard guard(places[i].GetDeviceId());
hccl_comms[i] = HCCLCommManager::Create(GetSize(), GetRank(), &hccl_id,
comms.get() + i);
dev_ctx[i].reset(new NPUDeviceContext(places[i]));
}
std::vector<NPUEventManager> events;
events.resize(places.size());
// These caches are used later by sync/wait/communicate operations
places_to_events_.emplace(places_key, std::move(events));
places_to_hcclcomm_.emplace(places_key, std::move(hccl_comms));
places_to_ctx_.emplace(places_key, std::move(dev_ctx));
}
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::Collective(
std::vector<Tensor>& inputs, std::vector<Tensor>& outputs, Fn fn,
CommType op_type) {
const auto places = GetPlaceList(inputs);
const auto key = GetKeyFromPlaces(places);
{
std::lock_guard<std::mutex> lock(mutex_);
if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) {
CreateHCCLManagerCache(key, places);
}
}
auto& hccl_comms = places_to_hcclcomm_[key];
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
auto task = CreateTask(places, rank_, op_type, inputs);
task->SetOutputs(outputs);
// if (FLAGS_use_stream_safe_npu_allocator) {
// for (size_t i = 0; i < inputs.size(); ++i) {
// platform::NPUDeviceGuard guard(places[i].GetDeviceId());
// auto dense_tensor =
// std::dynamic_pointer_cast<phi::DenseTensor>(inputs[i].impl());
// memory::RecordStream(dense_tensor->Holder(),
// places_to_ctx_[key][i]->stream());
// }
// }
for (size_t i = 0; i < inputs.size(); ++i) {
platform::NPUDeviceGuard guard(places[i].GetDeviceId());
const auto& hccl_stream = places_to_ctx_[key][i]->stream();
fn(inputs[i], outputs[i], hccl_comms[i]->GetHcclComm(), hccl_stream);
}
for (size_t i = 0; i < inputs.size(); ++i) {
platform::NPUDeviceGuard guard(places[i].GetDeviceId());
task->control_events_[i].Record(*places_to_ctx_[key][i]);
}
return task;
}
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::PointToPoint(
std::vector<Tensor>& tensors, Fn fn, int dst_rank, CommType op_type) {
const auto places = GetPlaceList(tensors);
const auto key = GetKeyFromPlaces(places);
{
std::lock_guard<std::mutex> lock(mutex_);
if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) {
CreateHCCLManagerCache(key, places);
}
}
auto& hccl_comms = places_to_hcclcomm_[key];
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
auto task = CreateTask(places, rank_, op_type, tensors);
// construct an uninitialized guard for the device
// if (FLAGS_use_stream_safe_npu_allocator) {
// for (size_t i = 0; i < tensors.size(); ++i) {
// platform::NPUDeviceGuard guard(places[i].GetDeviceId());
// auto dense_tensor =
// std::dynamic_pointer_cast<phi::DenseTensor>(tensors[i].impl());
// memory::RecordStream(dense_tensor->Holder(),
// places_to_ctx_[key][i]->stream());
// }
// }
for (size_t i = 0; i < tensors.size(); ++i) {
platform::NPUDeviceGuard guard(places[i].GetDeviceId());
const auto& hccl_stream = places_to_ctx_[key][i]->stream();
fn(tensors[i], hccl_comms[i]->GetHcclComm(), hccl_stream, dst_rank);
}
for (size_t i = 0; i < tensors.size(); ++i) {
platform::NPUDeviceGuard guard(places[i].GetDeviceId());
task->control_events_[i].Record(*places_to_ctx_[key][i]);
}
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::AllReduce(
std::vector<Tensor>& tensors, const AllreduceOptions& opts) {
// PADDLE_ENFORCE_EQ(
// CheckTensorsInNPUPlace(tensors), true,
// platform::errors::InvalidArgument("All inputs should be in
// NPUPlace."));
return Collective(
tensors, tensors,
[&](const Tensor& input, Tensor& output, HcclComm comm,
const aclrtStream& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
return platform::dynload::HcclAllReduce(
input_tensor->data(), output_tensor->data(), input_tensor->numel(),
platform::ToHCCLDataType(input.type()),
ToHCCLRedType(opts.reduce_op), comm, stream);
},
CommType::ALLREDUCE);
}
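// Note: `tensors` is passed as both inputs and outputs above, i.e. the
// all-reduce runs in place on the same tensor list.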
std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::Broadcast(
std::vector<Tensor>& tensors, const BroadcastOptions& opts) {
// PADDLE_ENFORCE_EQ(
// CheckTensorsInNPUPlace(tensors), true,
// platform::errors::InvalidArgument("All inputs should be in
// CudaPlace."));
return Collective(
tensors, tensors,
[&](Tensor& input, Tensor& output, HcclComm comm,
const aclrtStream& stream) {
const auto root = opts.source_rank * tensors.size() + opts.source_root;
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
return platform::dynload::HcclBroadcast(
input_tensor->data(), input_tensor->numel(),
platform::ToHCCLDataType(input.type()), root, comm, stream);
},
CommType::BROADCAST);
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/platform/device/npu/npu_stream.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
constexpr const char* HCCL_BACKEND_NAME = "HCCL";
namespace paddle {
namespace distributed {
using Place = paddle::platform::Place;
using NPUStream = platform::stream::NPUStream;
using NPUDeviceContext = paddle::platform::NPUDeviceContext;
class ProcessGroupHCCL : public ProcessGroup {
public:
class HCCLTask : public ProcessGroup::Task,
public std::enable_shared_from_this<HCCLTask> {
public:
HCCLTask(const std::vector<Place>& places, int rank, CommType CommType,
const std::vector<Tensor>& inputs);
bool IsCompleted();
void SynchronizeStreams();
bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
void Synchronize();
void SetOutputs(std::vector<Tensor>& outputs); // NOLINT
virtual ~HCCLTask();
std::vector<NPUEventManager> control_events_;
protected:
std::vector<Place> places_;
std::vector<std::shared_ptr<HCCLCommManager>> hcclComms_;
std::shared_ptr<std::vector<Tensor>> outputs_;
private:
};
ProcessGroupHCCL(const std::shared_ptr<Store>& store, int rank, int size);
const std::string GetBackendName() const override {
return std::string(HCCL_BACKEND_NAME);
}
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<Tensor>& tensors,
const AllreduceOptions& = AllreduceOptions()) override;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<Tensor>& tensors,
const BroadcastOptions& = BroadcastOptions()) override;
std::shared_ptr<ProcessGroup::Task> Barrier(
const BarrierOptions& = BarrierOptions()) override;
std::shared_ptr<ProcessGroup::Task> Send(std::vector<Tensor>& tensors,
int dst_rank) override;
std::shared_ptr<ProcessGroup::Task> Recv(std::vector<Tensor>& tensors,
int src_rank) override;
std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<Tensor>& in_tensors,
std::vector<Tensor>& out_tensors) override;
std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<Tensor>& in, std::vector<Tensor>& out) override;
std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<Tensor>& tensors, const ReduceOptions& opts) override;
std::shared_ptr<ProcessGroup::Task> Scatter(std::vector<Tensor>& in_tensors,
std::vector<Tensor>& out_tensors,
const ScatterOptions&) override;
protected:
virtual std::shared_ptr<ProcessGroupHCCL::HCCLTask> CreateTask(
std::vector<Place> places, int rank, CommType opType,
const std::vector<Tensor>& inputs);
std::shared_ptr<Store> store_;
std::shared_ptr<HCCLCommManager> hccl_comm_;
std::mutex mutex_;
std::unordered_map<std::string, std::vector<std::shared_ptr<HCCLCommManager>>>
places_to_hcclcomm_;
std::unordered_map<std::string, std::vector<NPUEventManager>>
places_to_events_;
std::unordered_map<std::string,
std::vector<std::unique_ptr<NPUDeviceContext>>>
places_to_ctx_;
std::set<int> used_place_ids_;
private:
void BcastHCCLId(std::vector<HcclRootInfo>& hccl_ids, int root, // NOLINT
int server_fd);
void BroadcastUniqueHCCLID(std::vector<HcclRootInfo>& hccl_ids); // NOLINT
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> Collective(
std::vector<Tensor>& inputs, // NOLINT
std::vector<Tensor>& outputs, // NOLINT
Fn fn, CommType op_type);
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> PointToPoint(
std::vector<Tensor>& tensors, // NOLINT
Fn fn, int dst_rank, CommType op_type);
void CreateHCCLManagerCache(const std::string& places_key,
const std::vector<Place>& places);
};
} // namespace distributed
} // namespace paddle
......@@ -24,10 +24,14 @@ limitations under the License. */
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/phi/core/kernel_registry.h"
USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(fill_constant);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
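// With USE_OP_ITSELF only the operator definition is pulled in, so the phi
// kernels this test actually runs (add/full on CPU) must now be declared
// explicitly via PD_DECLARE_KERNEL.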
namespace paddle {
namespace distributed {
......
......@@ -136,10 +136,6 @@ void MasterDaemon::run() {
}
for (size_t i = 1; i < fds.size(); i++) {
VLOG(0) << "fds.size:" << fds.size();
VLOG(0) << "fds.size-i:" << i;
VLOG(0) << "fds[i].revents:" << fds[i].revents;
try {
if (fds[i].revents == 0) {
continue;
......
set(eager_deps phi phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node)
set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node)
set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy)
set(generated_deps dygraph_function dygraph_node)
......@@ -10,11 +10,11 @@ endif()
add_subdirectory(api)
add_subdirectory(accumulation)
cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi phi_api)
cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor)
cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator)
cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi phi_api)
cc_library(utils SRCS utils.cc DEPS phi phi_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils)
cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi_api phi_tensor)
cc_library(utils SRCS utils.cc DEPS phi_api phi_tensor global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils)
cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info)
add_subdirectory(tests)
......@@ -47,6 +47,9 @@ std::unordered_map<std::string, std::vector<std::string>>
static std::unordered_map<std::string, paddle::framework::AttributeMap>
operators_with_attrs = {};
/* --- Blacklist of ops that do NOT need code generation --- */
static std::unordered_set<std::string> black_ops_list = {"run_program"};
static std::string LegalizeVariableName(const std::string& var_name) {
std::string ret = var_name;
std::replace(ret.begin(), ret.end(), '-', '_'); // replace all '-' with '_'
......@@ -73,12 +76,6 @@ static bool IgnoreGradAttribute(const std::string& op_type,
}
static void PrepareAttrMapForOps() {
// Handle "run_program_op"
static framework::ProgramDesc fake_prog;
operators_with_attrs["run_program"] = {};
operators_with_attrs["run_program"]["global_block"] =
fake_prog.MutableBlock(0);
// Handle "fused_elemwise_add_activation"
std::vector<std::string> functor_list = {"a", "b"};
operators_with_attrs["fused_elemwise_add_activation"] = {};
......@@ -2349,6 +2346,9 @@ static void DygraphCodeGeneration(const std::string& output_dir) {
if (!CheckOpProto(op_proto)) continue;
const std::string& op_type = op_proto->type();
if (black_ops_list.count(op_type)) {
continue;
}
/* ----------------------------- */
/* ---- Collect Information ---- */
......
set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml")
set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml")
set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml")
set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml")
set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc")
set(tmp_forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h")
set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc")
......
......@@ -23,12 +23,13 @@ core_ops_returns_info = {}
core_ops_args_info = {}
core_ops_args_type_info = {}
namespace = ""
yaml_types_mapping = {
'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \
'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \
'float' : 'float', 'double' : 'double', 'bool' : 'bool', \
'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
'int64_t[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>',
'int64[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>',
'Tensor' : 'Tensor',
'Tensor[]' : 'std::vector<Tensor>',
'Tensor[Tensor[]]' : 'std::vector<std::vector<Tensor>>',
......@@ -125,6 +126,7 @@ def GetAutoGradMetaVectorName(string):
def ReadFwdFile(filepath):
f = open(filepath, 'r')
contents = yaml.load(f, Loader=yaml.FullLoader)
f.close()
return contents
......@@ -133,9 +135,13 @@ def ReadBwdFile(filepath):
contents = yaml.load(f, Loader=yaml.FullLoader)
ret = {}
for content in contents:
assert 'backward_api' in content.keys()
if 'backward_api' in content.keys():
api_name = content['backward_api']
else:
assert False
ret[api_name] = content
f.close()
return ret
......@@ -608,16 +614,23 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
returns_str += f"return returns;\n"
grad_node_name = GetGradNodeName(fwd_api_name)
if len(namespace) > 0:
grad_api_namespace = f"paddle::experimental::{namespace}"
else:
grad_api_namespace = f"paddle::experimental"
FUNCTION_TEMPLATE = """
std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {{
// Call grad_api function
auto grad_api_returns = paddle::experimental::{}({});
auto grad_api_returns = {}::{}({});
{}
}}
"""
node_definition_str = FUNCTION_TEMPLATE.format(
grad_node_name, bwd_api_name, grad_api_args_str, returns_str)
grad_node_name, grad_api_namespace, bwd_api_name, grad_api_args_str,
returns_str)
return node_definition_str
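# For example, with a hypothetical sparse backward api "conv3d_grad" and
# namespace == "sparse", the generated call becomes
#   auto grad_api_returns = paddle::experimental::sparse::conv3d_grad(...);
# while an empty namespace keeps paddle::experimental::conv3d_grad(...).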
......@@ -671,7 +684,7 @@ def GenerateNodeCreationCodes(
else:
# Tuple api_result
if IsPlainTensorType(rtype):
outputs_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);"
output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);"
else:
assert IsVectorTensorType(rtype)
output_autograd_meta = f" std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);\n"
......@@ -699,18 +712,24 @@ def GenerateNodeCreationCodes(
# SetTensorWrappers
set_tensor_wrappers_list = []
for name, (_, is_fwd_input, _) in backward_fwd_input_map.items():
for name, (atype, is_fwd_input, pos) in backward_fwd_input_map.items():
is_optional = (name in optional_inputs)
if is_fwd_input:
if is_optional:
set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);"
else:
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);"
else:
if IsVectorTensorType(atype):
tw_name = f"api_result[{pos}]"
else:
tw_name = f"api_result"
if is_optional:
set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, false);"
set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);"
else:
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, false);"
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);"
set_tensor_wrappers_list.append(set_tensor_wrappers)
set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list)
......@@ -850,6 +869,10 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
function_name = fwd_api_name
else:
function_name = fwd_api_name + "_intermediate"
if len(namespace) > 0:
forward_call_str = f"auto api_result = paddle::experimental::{namespace}::{function_name}({inputs_call_args_str});"
else:
forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});"
# Get return type list & outputs
......@@ -1000,7 +1023,9 @@ def GenerateNodeCCFile(filepath, node_definition_str):
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
#include "paddle/fluid/eager/to_static/run_program_op_node.h"
#include "paddle/phi/api/include/sparse_api.h"
"""
file_contents += node_definition_str
with open(filepath, 'a') as f:
......@@ -1021,11 +1046,12 @@ def GenerateNodeHFile(filepath, node_declaration_str):
def GenerateForwardCCFile(filepath, forward_definition_str):
file_contents = """
#include "paddle/phi/api/lib/dygraph_api.h"
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
"""
file_contents += GenerateCoreOpInfoDefinition()
......@@ -1042,6 +1068,7 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str):
#include "paddle/phi/api/all.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/eager/to_static/run_program_op_func.h"
"""
file_contents += GenerateCoreOpInfoDeclaration()
......@@ -1053,17 +1080,32 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str):
if __name__ == "__main__":
args = ParseArguments()
api_yaml_path = args.api_yaml_path
backward_yaml_path = args.backward_yaml_path
fwd_api_list = ReadFwdFile(api_yaml_path)
grad_api_dict = ReadBwdFile(backward_yaml_path)
api_yaml_paths = args.api_yaml_path.split(",")
backward_yaml_paths = args.backward_yaml_path.split(",")
# Generate per Dygraph API
node_declaration_str = ""
node_definition_str = ""
forward_definition_str = ""
forward_declaration_str = ""
for i in range(len(api_yaml_paths)):
api_yaml_path = api_yaml_paths[i]
backward_yaml_path = backward_yaml_paths[i]
if "sparse" in api_yaml_path:
assert "sparse" in backward_yaml_path
namespace = "sparse"
else:
namespace = ""
fwd_api_list = ReadFwdFile(api_yaml_path)
grad_api_dict = ReadBwdFile(backward_yaml_path)
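# Note: the two comma-separated path lists are assumed to be index-aligned,
# i.e. api.yaml pairs with backward.yaml and sparse_api.yaml pairs with
# sparse_bw_api.yaml; the "sparse" check above enforces that pairing.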
yaml_forward_definition_str = ""
yaml_forward_declaration_str = ""
yaml_node_declaration_str = ""
yaml_node_definition_str = ""
for fwd_api in fwd_api_list:
# We only generate Ops with grad
if 'backward' not in fwd_api.keys():
......@@ -1076,7 +1118,8 @@ if __name__ == "__main__":
no_need_buffer_set = set()
if 'no_need_buffer' in fwd_api.keys():
no_need_buffer_set = ParseNoNeedBuffer(fwd_api['no_need_buffer'])
no_need_buffer_set = ParseNoNeedBuffer(fwd_api[
'no_need_buffer'])
fwd_api_name = fwd_api['api']
fwd_args_str = fwd_api['args']
......@@ -1108,22 +1151,26 @@ if __name__ == "__main__":
intermediate_outputs = []
if 'intermediate' in fwd_api.keys():
intermediate_outputs = ParseIntermediate(fwd_api['intermediate'])
intermediate_outputs = ParseIntermediate(fwd_api[
'intermediate'])
IntermediateValidationCheck(intermediate_outputs, forward_returns_list)
IntermediateValidationCheck(intermediate_outputs,
forward_returns_list)
# Collect Original Forward Inputs/Outputs and then perform validation checks
orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward(
fwd_args_str, fwd_returns_str)
print("Parsed Original Forward Inputs List: ", orig_forward_inputs_list)
print("Prased Original Forward Attrs List: ", orig_forward_attrs_list)
print("Parsed Original Forward Inputs List: ",
orig_forward_inputs_list)
print("Prased Original Forward Attrs List: ",
orig_forward_attrs_list)
print("Parsed Original Forward Returns List: ",
orig_forward_returns_list)
# Forward Validation Checks
ForwardsValidationCheck(forward_inputs_list, forward_attrs_list,
forward_returns_list, orig_forward_inputs_list,
orig_forward_attrs_list,
ForwardsValidationCheck(
forward_inputs_list, forward_attrs_list, forward_returns_list,
orig_forward_inputs_list, orig_forward_attrs_list,
orig_forward_returns_list)
# Parse Backward Inputs/Outputs
......@@ -1146,20 +1193,23 @@ if __name__ == "__main__":
backward_inputs_list, backward_returns_list,
forward_inputs_position_map, forward_outputs_position_map)
print("Generated Backward Fwd Input Map: ", backward_fwd_input_map)
print("Generated Backward Grad Input Map: ", backward_grad_input_map)
print("Generated Backward Grad Output Map: ", backward_grad_output_map)
print("Generated Backward Grad Input Map: ",
backward_grad_input_map)
print("Generated Backward Grad Output Map: ",
backward_grad_output_map)
# Backward Validation Check
BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map,
BackwardValidationCheck(backward_fwd_input_map,
backward_grad_input_map,
backward_attrs_list)
# Node Declaration Generation
node_declaration_str += GenerateNodeDeclaration(
yaml_node_declaration_str += GenerateNodeDeclaration(
fwd_api_name, backward_fwd_input_map, backward_attrs_list,
no_need_buffer_set)
print("Generated Node Declaration: ", node_declaration_str)
node_definition_str += GenerateNodeDefinition(
yaml_node_definition_str += GenerateNodeDefinition(
fwd_api_name, bwd_api_name, backward_fwd_input_map,
backward_grad_input_map, backward_grad_output_map,
backward_attrs_list)
......@@ -1174,14 +1224,41 @@ if __name__ == "__main__":
intermediate_outputs)
print("Generated Forward Definition: ", forward_definition_str)
print("Generated Forward Declaration: ", forward_declaration_str)
forward_definition_str += definition_declaration_pair[0]
forward_declaration_str += definition_declaration_pair[1]
yaml_forward_definition_str += definition_declaration_pair[0]
yaml_forward_declaration_str += definition_declaration_pair[1]
# For python-level API dispatch
CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map,
forward_outputs_position_map,
forward_attrs_list)
if len(namespace) > 0:
forward_definition_str += f"""namespace {namespace} {{
{yaml_forward_definition_str}
}}
"""
forward_declaration_str += f"""namespace {namespace} {{
{yaml_forward_declaration_str}
}}
"""
node_declaration_str += f"""namespace {namespace} {{
{yaml_node_declaration_str}
}}
"""
node_definition_str += f"""namespace {namespace} {{
{yaml_node_definition_str}
}}
"""
else:
forward_definition_str += yaml_forward_definition_str
forward_declaration_str += yaml_forward_declaration_str
node_declaration_str += yaml_node_declaration_str
node_definition_str += yaml_node_definition_str
# Generate Files
nodes_h_path = args.nodes_h_path
nodes_cc_path = args.nodes_cc_path
......
......@@ -14,7 +14,7 @@
import os
import argparse
from eager_gen import yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap
from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap
skipped_fwd_api_names = set(["scale"])
......@@ -126,16 +126,20 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj
}}
"""
namespace_str = ""
if len(namespace) > 0:
namespace_str = f"{namespace}::"
if is_forward_only:
fwd_function_name = "paddle::experimental::" + namespace_str + fwd_api_name
else:
fwd_function_name = namespace_str + GetForwardFunctionName(fwd_api_name)
python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format(
fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str,
fwd_function_name, dygraph_function_call_str)
python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n"
python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void)) {namespace_str}eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n"
return python_c_function_str, python_c_function_reg_str
......@@ -189,7 +193,7 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) {
"""
core_ops_infos_registry = """
,{\"get_final_state_core_ops_args_info\",
{\"get_final_state_core_ops_args_info\",
(PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS,
\"C++ interface function for eager_get_final_state_core_ops_args_info.\"},
{\"get_final_state_core_ops_args_type_info\",
......@@ -218,10 +222,12 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str):
#include "pybind11/detail/common.h"
#include "paddle/phi/api/all.h"
#include "paddle/phi/api/lib/dygraph_api.h"
#include "paddle/phi/common/backend.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/fluid/pybind/op_function_common.h"
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/pybind/exception.h"
......@@ -254,7 +260,19 @@ def GeneratePythonCFile(filepath, python_c_str):
if __name__ == "__main__":
args = ParseArguments()
api_yaml_paths = args.api_yaml_path.split(",")
python_c_functions_reg_str = ""
python_c_functions_str = ""
for i in range(len(api_yaml_paths)):
api_yaml_path = api_yaml_paths[i]
if "sparse" in api_yaml_path:
namespace = "sparse"
else:
namespace = ""
fwd_api_list = ReadFwdFile(api_yaml_path)
python_c_function_list = []
......@@ -287,7 +305,8 @@ if __name__ == "__main__":
fwd_args_str, fwd_returns_str)
print("Parsed Original Forward Inputs List: ", forward_inputs_list)
print("Prased Original Forward Attrs List: ", forward_attrs_list)
print("Parsed Original Forward Returns List: ", forward_returns_list)
print("Parsed Original Forward Returns List: ",
forward_returns_list)
forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap(
forward_inputs_list, forward_returns_list)
......@@ -303,8 +322,18 @@ if __name__ == "__main__":
python_c_function_reg_list.append(python_c_function_reg_str)
print("Generated Python-C Function: ", python_c_function_str)
python_c_functions_str = "\n".join(python_c_function_list)
python_c_functions_reg_str = ",\n".join(python_c_function_reg_list)
# Append Namespace
python_c_functions_reg_str += ",\n".join(
python_c_function_reg_list) + ","
python_c_functions = "\n".join(python_c_function_list)
if len(namespace) > 0:
python_c_functions_str += f"""namespace {namespace} {{
{python_c_functions}
}}
"""
else:
python_c_functions_str += python_c_functions
python_c_str = GeneratePythonCWrappers(python_c_functions_str,
python_c_functions_reg_str)
......
......@@ -24,6 +24,8 @@
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT);
// TODO(jiabin): remove nolint here!!!
using namespace egr; // NOLINT
......
......@@ -33,6 +33,14 @@
#include "gperftools/profiler.h"
#endif
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
using namespace egr; // NOLINT
using namespace egr_utils_api; // NOLINT
......
......@@ -32,11 +32,19 @@
#include "gperftools/profiler.h"
#endif
#include "paddle/phi/core/kernel_registry.h"
using namespace egr; // NOLINT
using namespace egr_utils_api; // NOLINT
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
TEST(Benchmark, EagerScaleCUDA) {
eager_test::InitEnv(paddle::platform::CUDAPlace());
......@@ -186,7 +194,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) {
USE_OP_ITSELF(scale);
USE_OP_ITSELF(matmul_v2);
USE_OP_ITSELF(reduce_sum);
USE_OP_ITSELF(reduce_sum_grad);
USE_OP_ITSELF(elementwise_add);
#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
......@@ -34,6 +34,14 @@
#include "gperftools/profiler.h"
#endif
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
namespace paddle {
namespace imperative {
......
......@@ -34,8 +34,16 @@
#include "gperftools/profiler.h"
#endif
#include "paddle/phi/core/kernel_registry.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
namespace paddle {
namespace imperative {
......@@ -248,7 +256,7 @@ TEST(Benchmark, FluidMLPCUDA) {
USE_OP_ITSELF(scale);
USE_OP_ITSELF(matmul_v2);
USE_OP_ITSELF(reduce_sum);
USE_OP_ITSELF(reduce_sum_grad);
USE_OP_ITSELF(elementwise_add);
#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
......@@ -30,6 +30,10 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
namespace egr {
TEST(Backward, SingleNodeEmptyGrad) {
......
......@@ -31,6 +31,10 @@
#include "paddle/fluid/eager/tests/test_utils.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
namespace egr {
TEST(CrossBatchAccumulation, SingleScaleNode) {
......
......@@ -27,6 +27,10 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
namespace egr {
TEST(Forward, SingleNode) {
......
......@@ -30,6 +30,13 @@
#include "paddle/fluid/eager/hooks.h"
#include "paddle/fluid/eager/tests/test_utils.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
#endif
namespace egr {
paddle::experimental::Tensor hook_function(
......
......@@ -30,6 +30,12 @@
#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
namespace egr {
TEST(Generated, Sigmoid) {
......
......@@ -31,6 +31,10 @@
#include "paddle/fluid/eager/hooks.h"
#include "paddle/fluid/eager/tests/test_utils.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
namespace egr {
paddle::experimental::Tensor hook_function(
......
......@@ -27,6 +27,12 @@
#include "paddle/fluid/eager/hooks.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
namespace egr {
paddle::experimental::Tensor hook_function(
......
......@@ -23,6 +23,10 @@
#include "paddle/fluid/eager/tests/test_utils.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
namespace egr {
TEST(TensorUtils, Test) {
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/to_static/run_program_op_node.h"
#include "paddle/fluid/eager/utils.h"
inline void run_program_dygraph_function(
const std::vector<paddle::experimental::Tensor>& x,
const std::vector<paddle::experimental::Tensor>& params,
std::vector<paddle::experimental::Tensor*>& out, // NOLINT
std::vector<paddle::framework::Scope*>& step_scope, // NOLINT
std::vector<paddle::experimental::Tensor*>& dout, // NOLINT
const paddle::framework::AttributeMap& attrs) {
VLOG(2) << "start run run_program";
// Call forward function
RunProgramAPI(x, params, out, step_scope, dout, attrs);
VLOG(2) << "start run run_program grad";
// Prepare Autograd Meta
auto deref_out = details::DereferenceTensors(out);
std::vector<egr::AutogradMeta*> p_autograd_x =
egr::EagerUtils::nullable_autograd_meta(x);
std::vector<egr::AutogradMeta*> p_autograd_params =
egr::EagerUtils::nullable_autograd_meta(params);
std::vector<egr::AutogradMeta*> p_autograd_outs =
egr::EagerUtils::nullable_autograd_meta(deref_out);
bool trace_backward = egr::Controller::Instance().HasGrad();
bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(
trace_backward, &p_autograd_x, &p_autograd_params);
if (require_any_grad) {
std::vector<std::string> out_names;
for (auto& t : deref_out) {
out_names.emplace_back(t.name());
}
egr::EagerUtils::PassStopGradient(false, &p_autograd_outs);
// Create GradOpNode (1 means [out_grad], 2 means [x_grad, params_grad])
auto grad_node = std::make_shared<GradNodeRunProgram>(1, 2);
grad_node->SetFwdOutNames(out_names);
// Set Attributes
grad_node->SetAttrMap(attrs);
// Set TensorWrappers
grad_node->SetFwdX(x);
grad_node->SetFwdParams(params);
grad_node->SetStepScope(step_scope);
// Set Grad out rank as same as fwd input and set stop gradient to bwd
grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0);
grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1);
grad_node->SetGradInMeta(&p_autograd_outs, 0);
// Set Next Edges
grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0);
grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1);
egr::EagerUtils::SetOutRankWithSlot(&p_autograd_outs, 0);
// Set history for outputs: set the current grad node for them
egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node);
egr::EagerUtils::CheckAndRetainGrad(deref_out);
}
}
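// A minimal call-site sketch (names are hypothetical; in practice the
// dygraph run_program op prepares all of these from the Python side):
//
//   std::vector<paddle::experimental::Tensor> x = {...}, params = {...};
//   std::vector<paddle::experimental::Tensor*> out, dout;  // preallocated
//   std::vector<paddle::framework::Scope*> step_scope = {scope_ptr};
//   paddle::framework::AttributeMap attrs;  // holds start_op_index,
//       // end_op_index, is_test, program_id, global_block (see RunProgramAPI)
//   run_program_dygraph_function(x, params, out, step_scope, dout, attrs);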
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/tensor_wrapper.h"
#include "paddle/fluid/operators/run_program_op.h"
#include "paddle/fluid/platform/enforce.h"
namespace details {
using Tensor = paddle::experimental::Tensor;
static std::vector<Tensor> DereferenceTensors(
const std::vector<Tensor *> &tensor_ptr) {
std::vector<Tensor> res;
for (auto *t : tensor_ptr) {
res.emplace_back(*t);
}
return res;
}
static std::vector<std::string> GetTensorsName(const std::vector<Tensor> &ins) {
std::vector<std::string> in_names;
for (auto &in_t : ins) {
in_names.emplace_back(in_t.name());
}
return in_names;
}
static std::vector<std::string> GetTensorsName(
const std::vector<Tensor *> &ins) {
std::vector<std::string> in_names;
for (auto *in_t : ins) {
in_names.emplace_back(in_t->name());
}
return in_names;
}
static void CheckInputVarStatus(const Tensor &tensor) {
PADDLE_ENFORCE_EQ(
tensor.defined() && phi::DenseTensor::classof(tensor.impl().get()), true,
paddle::platform::errors::InvalidArgument(
"The input tensor %s of "
"RunProgram(Grad)Op holds "
"wrong type. Expect type is DenseTensor.",
tensor.name()));
PADDLE_ENFORCE_EQ(tensor.initialized(), true,
paddle::platform::errors::InvalidArgument(
"The tensor in input tensor %s of "
"RunProgram(Grad)Op "
"is not initialized.",
tensor.name()));
}
static void CheckOutputVarStatus(const paddle::framework::Variable &src_var,
const Tensor &dst_tensor) {
auto name = dst_tensor.name();
PADDLE_ENFORCE_EQ(dst_tensor.defined(), true,
paddle::platform::errors::InvalidArgument(
"dst_tensor shall be defined."));
if (phi::DenseTensor::classof(dst_tensor.impl().get())) {
auto &src_tensor = src_var.Get<phi::DenseTensor>();
PADDLE_ENFORCE_EQ(phi::DenseTensor::classof(&src_tensor), true,
paddle::platform::errors::InvalidArgument(
"The output tensor %s get from "
"RunProgram(Grad)Op's internal scope holds "
"wrong type. Expect type is DenseTensor",
name));
PADDLE_ENFORCE_EQ(src_tensor.initialized(), true,
paddle::platform::errors::InvalidArgument(
"The tensor in output tensor %s get from "
"RunProgram(Grad)Op's internal "
"scope is not initialized.",
name));
} else if (phi::SelectedRows::classof(dst_tensor.impl().get())) {
auto &src_tensor = src_var.Get<phi::SelectedRows>();
PADDLE_ENFORCE_EQ(phi::SelectedRows::classof(&src_tensor), true,
paddle::platform::errors::InvalidArgument(
"The output tensodfr %s get from "
"RunProgram(Grad)Op's internal scope holds "
"wrong type. Expect type is SelectedRows",
name));
PADDLE_ENFORCE_EQ(src_tensor.initialized(), true,
paddle::platform::errors::InvalidArgument(
"The tensor in output tensor %s get from "
"RunProgram(Grad)Op's "
"internal scope is not initialized.",
name));
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"The RunProgram(Grad)Op only support output "
"variable of type LoDTensor or SelectedRows",
name));
}
}
static void ShareTensorsIntoScope(const std::vector<Tensor> &tensors,
paddle::framework::Scope *scope) {
for (size_t i = 0; i < tensors.size(); ++i) {
auto name = tensors[i].name();
if (name == "Fake_var" || !tensors[i].is_initialized()) {
continue;
}
auto *var = scope->Var(name);
CheckInputVarStatus(tensors[i]);
// share tensor
auto tensor_base = tensors[i].impl();
if (phi::DenseTensor::classof(tensor_base.get())) {
auto *dst_tensor = var->GetMutable<phi::DenseTensor>();
auto t = std::dynamic_pointer_cast<phi::DenseTensor>(tensor_base);
*dst_tensor = *t;
} else if (phi::SelectedRows::classof(tensor_base.get())) {
auto *dst_tensor = var->GetMutable<phi::SelectedRows>();
auto t = std::dynamic_pointer_cast<phi::SelectedRows>(tensor_base);
*dst_tensor = *t;
}
}
}
static void ShareTensorsFromScope(
const std::vector<Tensor *> &tensors,
const paddle::framework::BlockDesc &global_block,
paddle::framework::Scope *scope) {
for (size_t i = 0; i < tensors.size(); ++i) {
// NOTE: If out_tmp.stop_gradient = True is set in model code, none of the
// parameters created before out_tmp have a @GRAD var, and looking them up
// in the scope would raise an error. So we skip sharing such vars (or their
// var@GRAD) if they don't appear in the global block.
auto &name = tensors[i]->name();
if (name == paddle::framework::kEmptyVarName || name == "Fake_var" ||
!global_block.HasVar(name)) {
VLOG(2) << "find tensor name is " << name << ", skip it!";
continue;
}
// NOTE: Skipping vars that are not found here is dangerous: a bug here
// would silently produce wrong gradients, which is very hard to track down!
auto *var = scope->FindVar(name);
PADDLE_ENFORCE_NOT_NULL(var, paddle::platform::errors::NotFound(
"The output tensor %s is not in "
"RunProgram(Grad)Op'"
"s internal scope.",
name));
CheckOutputVarStatus(*var, *tensors[i]);
// share tensor
// TODO(dev): Determine Tensor type by scope.var
// auto tensor_base = tensors[i]->impl();
// if (phi::DenseTensor::classof(tensor_base.get())) {
if (var->IsType<phi::DenseTensor>()) {
auto &src_tensor = var->Get<phi::DenseTensor>();
auto *dst_tensor = const_cast<phi::DenseTensor *>(
dynamic_cast<const phi::DenseTensor *>(tensors[i]->impl().get()));
VLOG(2) << "share " << name << " from scope";
*dst_tensor = src_tensor;
} else if (var->IsType<phi::SelectedRows>()) {
// } else if (phi::SelectedRows::classof(tensor_base.get())) {
auto &src_tensor = var->Get<phi::SelectedRows>();
auto *dst_tensor = const_cast<phi::SelectedRows *>(
dynamic_cast<const phi::SelectedRows *>(tensors[i]->impl().get()));
*dst_tensor = src_tensor;
}
}
}
} // namespace details
inline void RunProgramAPI(
const std::vector<paddle::experimental::Tensor> &x,
const std::vector<paddle::experimental::Tensor> &params,
std::vector<paddle::experimental::Tensor *> &out, // NOLINT
std::vector<paddle::framework::Scope *> &step_scope, // NOLINT
std::vector<paddle::experimental::Tensor *> &dout, // NOLINT
const paddle::framework::AttributeMap &attrs) {
VLOG(2) << "RunProgramOpKernel Compute";
auto start_op_index = BOOST_GET_CONST(int64_t, attrs.at("start_op_index"));
auto end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index"));
auto is_test = BOOST_GET_CONST(bool, attrs.at("is_test"));
auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id"));
// NOTE(chenweihang): To avoid adding a new variable type, a vector is used
// here; otherwise the scope could be used directly.
auto *out_scope_vec = &step_scope;
PADDLE_ENFORCE_EQ(
out_scope_vec->size(), 1,
paddle::platform::errors::InvalidArgument(
"The OutScope of RunProgramGradOp should only hold one scope."));
// Step 2. prepare executor and init persistable variables
// NOTE(Aurelius84): While training some models, forward can be called many
// times before backpropagation is applied all at once, e.g. in Reinforcement
// Learning. Tensor data from each step of multi-step training should be
// saved into its own scope; otherwise gradients can be miscalculated because
// only the Tensor data of the last forward step would be used.
paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
VLOG(2) << "The number of sub scopes before forward: "
<< out_scope_vec->front()->kids().size();
paddle::framework::Scope &scope = global_inner_scope->NewScope();
// share input_vars & parameters into scope
details::ShareTensorsIntoScope(x, &scope);
details::ShareTensorsIntoScope(params, &scope);
auto *global_block =
BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block"));
const auto &place = egr::Controller::Instance().GetExpectedPlace();
if (end_op_index > start_op_index) {
auto input_names = details::GetTensorsName(x);
auto output_names = details::GetTensorsName(out);
auto dout_names = details::GetTensorsName(dout);
auto *program = global_block->Program();
auto cache_info = paddle::framework::GetExecutorInfoFromCache(
*program, place, start_op_index, end_op_index,
/*is_grad=*/false, program_id, &scope);
auto &parallel_executor = cache_info.first;
// all out_vars are skip_eager_var
auto &skip_eager_delete_vars =
paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars(
program_id, false);
if (cache_info.second /*is_new_created*/) {
parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_names);
skip_eager_delete_vars.insert(skip_eager_delete_vars.end(),
output_names.begin(), output_names.end());
skip_eager_delete_vars.insert(skip_eager_delete_vars.end(),
dout_names.begin(), dout_names.end());
paddle::framework::details::ParseSafeEagerDeletionSkipVars(
*program, end_op_index, output_names, &skip_eager_delete_vars);
}
// Step 3. run ops
parallel_executor->RunWithoutFetch(skip_eager_delete_vars);
}
// Step 4. Get Output
details::ShareTensorsFromScope(out, *global_block, &scope);
details::ShareTensorsFromScope(dout, *global_block, &scope);
// Debug info: scope info when run end
VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front());
// Step 5. Drop all children scopes while testing.
if (is_test) {
out_scope_vec->front()->DropKids();
}
VLOG(2) << "The number of sub scopes after forward: "
<< out_scope_vec->front()->kids().size();
// #ifdef PADDLE_WITH_MKLDNN
// if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place);
// #endif
}
inline void RunProgramGradAPI(
const std::vector<paddle::experimental::Tensor> &x,
const std::vector<paddle::experimental::Tensor> &params,
const std::vector<paddle::experimental::Tensor> &out_grad,
const std::vector<paddle::framework::Scope *> &step_scope, // NOLINT
const paddle::framework::AttributeMap &attrs,
std::vector<paddle::experimental::Tensor *> &x_grad, // NOLINT
std::vector<paddle::experimental::Tensor *> &params_grad // NOLINT
) {
// if all output vars are set to stop_gradient, the grad op does not need to be executed
if (x_grad.empty() && params_grad.empty()) return;
// TODO(dev): Remove this line hard code. And need to deal with the out_grad
// name problem.
// const_cast<paddle::experimental::Tensor &>(out_grad[0])
// .set_name("matmul_v2_0.tmp_0@GRAD");
auto *global_block =
BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block"));
auto orig_end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index"));
auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id"));
// NOTE: skip the `shape` and `fill_constant` ops created by
// fluid.backward.gradients; each forward output generates one `shape`
// and one `fill_constant` op
int64_t start_op_index = orig_end_op_index + (out_grad.size() * 2);
int64_t end_op_index = global_block->OpSize();
auto *out_scope_vec = &step_scope;
PADDLE_ENFORCE_EQ(
out_scope_vec->size(), 1,
paddle::platform::errors::InvalidArgument(
"The OutScope of RunProgramGradOp should only hold one scope."));
paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
auto sub_scope_num = global_inner_scope->kids().size();
VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num;
PADDLE_ENFORCE_GT(sub_scope_num, 0,
paddle::platform::errors::InvalidArgument(
"The OutScope of RunProgramGradOp should hold at "
"least one sub scope."));
auto &scope = *(global_inner_scope->kids().front());
const auto &place = egr::Controller::Instance().GetExpectedPlace();
if (end_op_index > start_op_index) {
auto out_grad_names = details::GetTensorsName(out_grad);
// NOTE: after PR22939 [Add double grad] was merged, the grad op maker's
// SetOutput sets the output to None if the input var has stop_gradient=True,
// which causes a NotFound error when ctx.OutputNames() is called
std::vector<std::string> x_grad_names;
std::vector<std::string> param_grad_names;
if (!x_grad.empty()) {
x_grad_names = details::GetTensorsName(x_grad);
}
if (!params_grad.empty()) {
param_grad_names = details::GetTensorsName(params_grad);
}
// Step 2. prepare executor and scope
auto *program = global_block->Program();
auto cache_info = paddle::framework::GetExecutorInfoFromCache(
*program, place, start_op_index, end_op_index,
/*is_grad*/ true, program_id, &scope);
auto &parallel_executor = cache_info.first;
auto &skip_eager_delete_vars =
paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars(
program_id, true);
if (cache_info.second /*is_new_created*/) {
parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, out_grad_names);
skip_eager_delete_vars.insert(skip_eager_delete_vars.end(),
x_grad_names.begin(), x_grad_names.end());
paddle::framework::details::AppendSkipDeletionVars(
param_grad_names, &skip_eager_delete_vars);
}
details::ShareTensorsIntoScope(out_grad, &scope);
// Debug info: scope info when run end
VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front());
// Step 3. run ops
parallel_executor->RunWithoutFetch(
/*skip_eager_delete_vars=*/skip_eager_delete_vars);
}
// Step 4. get outputs
details::ShareTensorsFromScope(x_grad, *global_block, &scope);
details::ShareTensorsFromScope(params_grad, *global_block, &scope);
// Step5. drop current scope
// global_inner_scope->DeleteScope(&scope);
VLOG(2) << "The number of sub scopes after backward: "
<< global_inner_scope->kids().size();
}
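// Worked example for the op-index arithmetic above (illustrative values):
// with orig_end_op_index = 5 and two forward outputs (out_grad.size() == 2),
// start_op_index = 5 + 2 * 2 = 9, i.e. the backward section starts after
// skipping one auto-generated `shape` + `fill_constant` pair per output.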
class GradNodeRunProgram : public egr::GradNodeBase {
public:
GradNodeRunProgram(size_t bwd_in_slot_num, size_t bwd_out_slot_num)
: egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}
~GradNodeRunProgram() override = default;
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>> &grads)
override {
VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram";
PADDLE_ENFORCE_EQ(
grads.size(), 1,
paddle::platform::errors::InvalidArgument(
"The out_grads.size() of RunProgramGradOp should be equal to 1."));
VLOG(3) << "out_grads[0].size() : " << grads[0].size();
std::vector<paddle::experimental::Tensor> x_grad;
std::vector<paddle::experimental::Tensor> params_grad;
ConstructGradTensors(x_, &x_grad);
ConstructGradTensors(params_, &params_grad);
std::vector<paddle::experimental::Tensor *> x_grad_ptr;
std::vector<paddle::experimental::Tensor *> params_grad_ptr;
for (auto &i : x_grad) {
x_grad_ptr.emplace_back(&i);
}
for (auto &i : params_grad) {
params_grad_ptr.emplace_back(&i);
}
// auto x_grad_ptr = ConstructGradTensors(x_);
// auto params_grad_ptr = ConstructGradTensors(params_);
PADDLE_ENFORCE_EQ(
grads[0].size(), fwd_out_names_.size(),
paddle::platform::errors::InvalidArgument(
"The grads[0].size() and fwd_out_names_.size() should be equal."));
for (size_t i = 0; i < fwd_out_names_.size(); ++i) {
const_cast<paddle::experimental::Tensor &>(grads[0][i])
.set_name(fwd_out_names_[i] + "@GRAD");
}
RunProgramGradAPI(x_, params_, grads[0], step_scope_, attrs_, x_grad_ptr,
params_grad_ptr);
VLOG(3) << "End Eager Backward Node: GradNodeRunProgram";
return {x_grad, params_grad};
// return {x_grad, details::DereferenceTensors(params_grad_ptr)};
}
// SetAttrMap
void SetAttrMap(const paddle::framework::AttributeMap &attrs) {
attrs_ = attrs;
}
void SetFwdX(const std::vector<paddle::experimental::Tensor> &tensors) {
x_ = tensors;
}
void SetFwdParams(const std::vector<paddle::experimental::Tensor> &tensors) {
params_ = tensors;
}
void SetStepScope(const std::vector<paddle::framework::Scope *> &scopes) {
step_scope_ = scopes;
}
void SetFwdOutNames(std::vector<std::string> out_names) {
fwd_out_names_ = out_names;
}
protected:
void ConstructGradTensors(
const std::vector<paddle::experimental::Tensor> &fwd_tensors,
std::vector<paddle::experimental::Tensor> *grad_tensors) {
// TODO(dev): Need an elegant way to determine information of grad_tensor,
// such as: name, tensor type(DenseTensor or SelectedRows).
VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size();
for (auto &fwd_t : fwd_tensors) {
grad_tensors->emplace_back(fwd_t.impl());
auto &grad_t = grad_tensors->back();
grad_t.set_name(fwd_t.name() + "@GRAD");
}
}
void ConstructGradTensors(
const std::vector<paddle::experimental::Tensor> &fwd_tensors) {
VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size();
for (auto &fwd_t : fwd_tensors) {
auto grad_tensor = egr::EagerUtils::unsafe_autograd_meta(fwd_t)->Grad();
grad_tensor.set_name(fwd_t.name() + "@GRAD");
}
}
private:
// TensorWrappers
std::vector<paddle::experimental::Tensor> x_;
std::vector<paddle::experimental::Tensor> params_;
std::vector<paddle::framework::Scope *> step_scope_;
std::vector<std::string> fwd_out_names_;
// Attribute Map
paddle::framework::AttributeMap attrs_;
};
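// Sketch of how this node is driven during backward (hypothetical driver
// code; the real invocation comes from the eager autograd engine):
//
//   auto node = std::make_shared<GradNodeRunProgram>(/*bwd_in=*/1, /*bwd_out=*/2);
//   node->SetAttrMap(attrs);
//   node->SetFwdX(x);
//   node->SetFwdParams(params);
//   node->SetStepScope(step_scope);
//   node->SetFwdOutNames(out_names);
//   auto grads_out = (*node)({out_grads});  // returns {x_grad, params_grad}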
......@@ -443,7 +443,7 @@ cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framewo
#cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} )
#cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator)
cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
......
......@@ -90,6 +90,8 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext {
bool IsForInferShape() const override { return true; }
bool IsRuntime() const override { return ctx_.IsRuntime(); }
private:
const InferShapeContext& ctx_;
};
......@@ -232,16 +234,8 @@ class CompatMetaTensor : public phi::MetaTensor {
}
}
void share_dims(const MetaTensor& meta_tensor) override {
set_dims(meta_tensor.dims());
// special case: share height and rows of SelectedRows in runtime
if (is_runtime_) {
auto* var = BOOST_GET(Variable*, var_);
if (var->IsType<phi::SelectedRows>()) {
......@@ -254,6 +248,16 @@ class CompatMetaTensor : public phi::MetaTensor {
}
}
void share_meta(const MetaTensor& meta_tensor) override {
set_dtype(meta_tensor.dtype());
// VarDesc doesn't contain layout, so we cannot share layout
// set_layout(meta_tensor.layout());
// special case 1: share lod of LoDTensor
share_lod(meta_tensor);
share_dims(meta_tensor);
}
private:
const LoD& GetRuntimeLoD() const {
auto* var = BOOST_GET_CONST(Variable*, var_);
......
......@@ -20,12 +20,15 @@
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
#include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/phi/core/kernel_registry.h"
USE_OP_ITSELF(scale);
USE_OP(elementwise_mul);
USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(elementwise_add_grad);
PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT);
DECLARE_double(eager_delete_tensor_gb);
namespace paddle {
......
......@@ -15,8 +15,9 @@
#include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h"
#include <gtest/gtest.h>
#include <unordered_set>
#include <boost/logic/tribool.hpp>
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/framework/op_registry.h"
......@@ -27,7 +28,7 @@ USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
USE_OP(leaky_relu);
USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN);
USE_OP(gelu);
USE_OP_ITSELF(relu);
USE_OP(tanh);
USE_OP_DEVICE_KERNEL(tanh, MKLDNN);
......
......@@ -37,7 +37,7 @@ USE_OP(elementwise_mul);
USE_OP(softmax_with_cross_entropy);
USE_OP_ITSELF(reduce_mean);
USE_OP_ITSELF(reduce_sum);
USE_OP_ITSELF(reduce_sum_grad);
USE_OP(reduce_mean_grad);
USE_OP_ITSELF(reshape2_grad);
USE_OP(softmax_with_cross_entropy_grad);
......@@ -46,7 +46,7 @@ USE_OP(matmul_grad);
USE_OP(square);
USE_OP(transpose2_grad);
USE_OP(concat_grad);
USE_OP_ITSELF(elementwise_mul_grad);
USE_OP(sigmoid_grad);
USE_OP(tanh_grad);
USE_OP(sum);
......
......@@ -539,6 +539,20 @@ bool ExecutionContext::HasInput(const std::string& name) const {
return var != nullptr;
}
bool ExecutionContext::HasInputs(const std::string& name) const {
const auto& ins = ctx_.inputs;
auto it = ins.find(name);
if (it == ins.end() || it->second.empty()) {
return false;
}
for (const auto* input : it->second) {
if (input == nullptr) {
return false;
}
}
return true;
}
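// Unlike HasInput, which resolves a single variable, HasInputs only succeeds
// when the name maps to a non-empty input list whose entries are all
// non-null -- the relevant check for duplicable (multi-variable) inputs.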
bool ExecutionContext::HasOutput(const std::string& name) const {
auto* var = OutputVar(name);
return var != nullptr;
......@@ -2189,6 +2203,51 @@ void OperatorWithKernel::BuildPhiKernelContext(
std::move(experimental::MakePhiScalarFromVar(*ins_vector.front())));
}
} else if (attr_defs[i].type_index ==
std::type_index(typeid(std::vector<phi::Scalar>))) {
auto& attr = Attrs().at(attr_names[i]);
if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<int32_t>))) {
const auto& vec = BOOST_GET_CONST(std::vector<int32_t>, attr);
std::vector<phi::Scalar> scalar_list;
scalar_list.reserve(vec.size());
for (const auto& val : vec) {
scalar_list.emplace_back(val);
}
pt_kernel_context->EmplaceBackAttr(std::move(scalar_list));
} else if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<int64_t>))) {
const auto& vec = BOOST_GET_CONST(std::vector<int64_t>, attr);
std::vector<phi::Scalar> scalar_list;
scalar_list.reserve(vec.size());
for (const auto& val : vec) {
scalar_list.emplace_back(val);
}
pt_kernel_context->EmplaceBackAttr(std::move(scalar_list));
} else if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<float>))) {
const auto& vec = BOOST_GET_CONST(std::vector<float>, attr);
std::vector<phi::Scalar> scalar_list;
scalar_list.reserve(vec.size());
for (const auto& val : vec) {
scalar_list.emplace_back(val);
}
pt_kernel_context->EmplaceBackAttr(std::move(scalar_list));
} else if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<double>))) {
const auto& vec = BOOST_GET_CONST(std::vector<double>, attr);
std::vector<phi::Scalar> scalar_list;
scalar_list.reserve(vec.size());
for (const auto& val : vec) {
scalar_list.emplace_back(val);
}
pt_kernel_context->EmplaceBackAttr(std::move(scalar_list));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported cast op attribute `%s` to vector<Scalar> when "
"construct KernelContext.",
attr_names[i]));
}
} else {
// TODO(chenweihang): support other attrs later
auto& attr = Attrs().at(attr_names[i]);
......@@ -2212,6 +2271,10 @@ void OperatorWithKernel::BuildPhiKernelContext(
} else if (attr_defs[i].type_index ==
std::type_index(typeid(std::vector<int64_t>))) {
if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<int64_t>))) {
pt_kernel_context->EmplaceBackAttr(
BOOST_GET_CONST(std::vector<int64_t>, attr));
} else if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<int>))) {
// Emplace Back Attr according to the type of Phi_Kernel args.
const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
......
......@@ -295,6 +295,8 @@ class ExecutionContext {
virtual bool HasInput(const std::string& name) const;
virtual bool HasInputs(const std::string& name) const;
virtual bool HasOutput(const std::string& name) const;
virtual size_t InputSize(const std::string& name) const {
......@@ -449,7 +451,7 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext {
: ctx_(ctx) {}
bool HasInput(const std::string& name) const override {
return ctx_.HasInputs(name);
}
bool HasOutput(const std::string& name) const override {
......
......@@ -675,7 +675,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) {
USE_PASS(build_cinn_pass);
USE_OP(mul);
USE_OP_ITSELF(relu);
USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(relu_grad);
USE_OP_ITSELF(elementwise_add_grad);
......@@ -301,5 +301,5 @@ TEST(CinnCompilerTest, Compile) {
USE_PASS(build_cinn_pass);
USE_PASS(graph_viz_pass);
USE_OP(mul);
USE_OP_ITSELF(relu);
USE_OP_ITSELF(elementwise_add);
......@@ -133,6 +133,11 @@ class DygraphExecutionContext : public framework::ExecutionContext {
return (it != var_map_in_.end() && it->second.size() > 0);
}
bool HasInputs(const std::string& name) const override {
auto it = var_map_in_.find(name);
return (it != var_map_in_.end() && it->second.size() > 0);
}
bool HasOutput(const std::string& name) const override {
auto it = var_map_out_.find(name);
return (it != var_map_out_.end() && it->second.size() > 0);
......
......@@ -247,6 +247,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
#endif
#ifdef PADDLE_WITH_XPU_KP
expected_kernel_key.place_ = platform::XPUPlace();
bool use_xpu_kp_kernel_rt =
FLAGS_run_kp_kernel &&
paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key);
......
......@@ -332,6 +332,7 @@ void BuildDygraphPhiKernelContext(
}
for (size_t i = 0; i < attr_names.size(); ++i) {
VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i];
if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) {
if (attrs.find(attr_names[i]) !=
attrs.end()) { // shape is in the attribute
......@@ -409,6 +410,60 @@ void BuildDygraphPhiKernelContext(
experimental::MakePhiScalarFromVar(ins_vector[0]->Var())));
}
} else if (attr_defs[i].type_index ==
std::type_index(typeid(std::vector<phi::Scalar>))) {
auto& attr = GetAttr(attrs, default_attrs, attr_names[i]);
if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<int32_t>))) {
const auto& vec = BOOST_GET_CONST(std::vector<int32_t>, attr);
std::vector<phi::Scalar> scalar_list;
scalar_list.reserve(vec.size());
for (const auto& val : vec) {
scalar_list.emplace_back(val);
}
kernel_ctx->EmplaceBackAttr(std::move(scalar_list));
} else if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<int64_t>))) {
const auto& vec = BOOST_GET_CONST(std::vector<int64_t>, attr);
std::vector<phi::Scalar> scalar_list;
scalar_list.reserve(vec.size());
for (const auto& val : vec) {
scalar_list.emplace_back(val);
}
kernel_ctx->EmplaceBackAttr(std::move(scalar_list));
} else if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<float>))) {
const auto& vec = BOOST_GET_CONST(std::vector<float>, attr);
std::vector<phi::Scalar> scalar_list;
scalar_list.reserve(vec.size());
for (const auto& val : vec) {
scalar_list.emplace_back(val);
}
kernel_ctx->EmplaceBackAttr(std::move(scalar_list));
} else if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<double>))) {
const auto& vec = BOOST_GET_CONST(std::vector<double>, attr);
std::vector<phi::Scalar> scalar_list;
scalar_list.reserve(vec.size());
for (const auto& val : vec) {
scalar_list.emplace_back(val);
}
kernel_ctx->EmplaceBackAttr(std::move(scalar_list));
} else if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<bool>))) {
const auto& vec = BOOST_GET_CONST(std::vector<bool>, attr);
std::vector<phi::Scalar> scalar_list;
scalar_list.reserve(vec.size());
for (const auto& val : vec) {
scalar_list.emplace_back(val);
}
kernel_ctx->EmplaceBackAttr(std::move(scalar_list));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported cast op attribute `%s` to vector<Scalar> when "
"construct KernelContext.",
attr_names[i]));
}
} else {
// TODO(chenweihang): support other attrs later
auto& attr = GetAttr(attrs, default_attrs, attr_names[i]);
......@@ -432,6 +487,10 @@ void BuildDygraphPhiKernelContext(
} else if (attr_defs[i].type_index ==
std::type_index(typeid(std::vector<int64_t>))) {
if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<int64_t>))) {
kernel_ctx->EmplaceBackAttr(
BOOST_GET_CONST(std::vector<int64_t>, attr));
} else if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<int>))) {
// Emplace Back Attr according to the type of Phi_Kernel args.
const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
......
......@@ -24,6 +24,10 @@
#include "paddle/fluid/imperative/hooks.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
namespace platform = paddle::platform;
namespace framework = paddle::framework;
......
......@@ -24,6 +24,13 @@
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/imperative/prepared_operator.h"
#include "paddle/fluid/imperative/type_defs.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(split, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(relu, GPU, ALL_LAYOUT);
#endif
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
......@@ -226,7 +233,7 @@ TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) {
} // namespace paddle
USE_OP_ITSELF(split);
USE_OP_ITSELF(relu);
#ifdef PADDLE_WITH_MKLDNN
USE_OP_DEVICE_KERNEL(relu, MKLDNN);
#endif
......@@ -28,6 +28,13 @@
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
#endif
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
......@@ -591,5 +598,5 @@ TEST(test_tracer, eager_tracer) {
USE_OP(mul);
USE_OP(mul_grad);
USE_OP_ITSELF(reduce_sum);
USE_OP_ITSELF(reduce_sum_grad);
USE_OP_ITSELF(elementwise_add);
......@@ -45,6 +45,11 @@ add_subdirectory(api)
set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor
zero_copy_tensor reset_tensor_array
analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg})
if(WITH_ONNXRUNTIME)
set(STATIC_INFERENCE_API ${STATIC_INFERENCE_API} onnxruntime_predictor)
endif()
#TODO(wilber, T8T9): Do we still need to support windows gpu static library?
if(WIN32 AND WITH_GPU)
cc_library(paddle_inference DEPS ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules})
......@@ -91,6 +96,13 @@ if (WITH_PSCORE)
set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service)
endif ()
if (WITH_ONNXRUNTIME)
set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS}
${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc
)
set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} onnxruntime_predictor)
endif (WITH_ONNXRUNTIME)
# Create shared inference library
cc_library(paddle_inference_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${SHARED_INFERENCE_DEPS})
......
......@@ -31,7 +31,7 @@ cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tens
cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
set(paddle_inference_api_deps lod_tensor scope reset_tensor_array
analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator)
if(WITH_CRYPTO)
list(APPEND paddle_inference_api_deps paddle_crypto)
......@@ -49,8 +49,15 @@ if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
endif()
if (WITH_ONNXRUNTIME)
cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps}
zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils onnxruntime paddle2onnx)
cc_library(onnxruntime_predictor SRCS onnxruntime_predictor.cc DEPS analysis_predictor)
else (WITH_ONNXRUNTIME)
cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps}
zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils)
endif (WITH_ONNXRUNTIME)
cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api)
......@@ -75,6 +82,16 @@ elseif (WIN32)
ARGS --dirname=${WORD2VEC_MODEL_DIR})
endif()
if (WITH_ONNXRUNTIME)
if (NOT APPLE AND NOT WIN32)
cc_test(test_onnxruntime_predictor SRCS onnxruntime_predictor_tester.cc DEPS paddle_inference_shared
ARGS --dirname=${MOBILENETV2_MODEL_DIR})
elseif (WIN32)
cc_test(test_onnxruntime_predictor SRCS onnxruntime_predictor_tester.cc DEPS onnxruntime_predictor benchmark ${inference_deps}
ARGS --dirname=${MOBILENETV2_MODEL_DIR})
endif()
endif()
if(WITH_TESTING AND WITH_MKLDNN)
if (NOT APPLE AND NOT WIN32)
cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR})
......
......@@ -168,6 +168,33 @@ void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, int ipu_replica_num,
Update();
}
void AnalysisConfig::EnableONNXRuntime() {
#ifdef PADDLE_WITH_ONNXRUNTIME
use_onnxruntime_ = true;
#else
LOG(ERROR) << "Please compile with onnxruntime to EnableONNXRuntime()";
use_onnxruntime_ = false;
#endif
Update();
}
void AnalysisConfig::DisableONNXRuntime() {
use_onnxruntime_ = false;
Update();
}
void AnalysisConfig::EnableORTOptimization() {
#ifdef PADDLE_WITH_ONNXRUNTIME
enable_ort_optimization_ = true;
#else
LOG(ERROR) << "Please compile with onnxruntime to EnableORTOptimization()";
enable_ort_optimization_ = false;
#endif
Update();
}
AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
#define CP_MEMBER(member__) member__ = other.member__;
......
......@@ -65,6 +65,10 @@
#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
#endif
#ifdef PADDLE_WITH_ONNXRUNTIME
#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
#endif
#if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
......@@ -1762,6 +1766,27 @@ namespace paddle_infer {
Predictor::Predictor(const Config &config) {
const_cast<Config *>(&config)->SwitchUseFeedFetchOps(false);
// The second parameter indicates that the discard log is not printed
if (config.use_onnxruntime()) {
#ifdef PADDLE_WITH_ONNXRUNTIME
if (config.use_gpu()) {
LOG(WARNING) << "The current ONNXRuntime backend doesn't support GPU,"
"and it falls back to use Paddle Inference.";
} else if (!paddle::CheckConvertToONNX(config)) {
LOG(WARNING)
<< "Paddle2ONNX do't support convert the Model, fall back to using "
"Paddle Inference.";
} else {
predictor_ = paddle::CreatePaddlePredictor<
Config, paddle::PaddleEngineKind::kONNXRuntime>(config);
return;
}
#else
LOG(WARNING)
<< "The onnxruntime backend isn't enabled,"
" and please re-compile Paddle with WITH_ONNXRUNTIME option,"
"fall back to using Paddle Inference.";
#endif
}
predictor_ = paddle::CreatePaddlePredictor<
Config, paddle::PaddleEngineKind::kAnalysis>(config);
}
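// Typical configuration flow (mirrors the EnableONNXRuntime tests; the
// model paths are examples):
//
//   paddle_infer::Config config;
//   config.SetModel("inference.pdmodel", "inference.pdiparams");
//   config.EnableONNXRuntime();       // logs an error if not compiled in
//   config.EnableORTOptimization();
//   auto predictor = paddle_infer::CreatePredictor(config);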
......
......@@ -357,6 +357,24 @@ TEST(AnalysisPredictor, set_xpu_device_id) {
}
#endif
TEST(AnalysisPredictor, enable_onnxruntime) {
AnalysisConfig config;
config.EnableONNXRuntime();
#ifdef PADDLE_WITH_ONNXRUNTIME
ASSERT_TRUE(config.use_onnxruntime());
#else
ASSERT_TRUE(!config.use_onnxruntime());
#endif
config.EnableORTOptimization();
#ifdef PADDLE_WITH_ONNXRUNTIME
ASSERT_TRUE(config.ort_optimization_enabled());
#else
ASSERT_TRUE(!config.ort_optimization_enabled());
#endif
config.DisableONNXRuntime();
ASSERT_TRUE(!config.use_onnxruntime());
}
} // namespace paddle
namespace paddle_infer {
......@@ -408,6 +426,14 @@ TEST(Predictor, Run) {
predictor->TryShrinkMemory();
}
TEST(Predictor, EnableONNXRuntime) {
Config config;
config.SetModel(FLAGS_dirname);
config.EnableONNXRuntime();
config.EnableORTOptimization();
auto predictor = CreatePredictor(config);
}
TEST(Tensor, CpuShareExternalData) {
Config config;
config.SetModel(FLAGS_dirname);
......
......@@ -4,6 +4,7 @@ option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL.
option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF)
option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON)
option(USE_TENSORRT "Compile demo with TensorRT." OFF)
option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF)
if(NOT WITH_STATIC_LIB)
add_definitions("-DPADDLE_WITH_SHARED_LIB")
......@@ -46,6 +47,13 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib")
link_directories("${PADDLE_LIB}/paddle/lib")
if (WITH_ONNXRUNTIME)
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib")
endif()
if (WIN32)
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
......@@ -151,6 +159,17 @@ else()
endif()
endif()
if (WITH_ONNXRUNTIME)
if(WIN32)
set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx)
elseif(APPLE)
set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx)
else()
set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx)
endif()
endif()
if (NOT WIN32)
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
set(DEPS ${DEPS}
......@@ -213,6 +232,14 @@ if(WIN32)
COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release
)
endif()
if(WITH_ONNXRUNTIME)
add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll
${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll
${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
)
endif()
if(NOT WITH_STATIC_LIB)
add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains demo of mobilenet for tensorrt.
*/
#include <glog/logging.h> // use glog instead of CHECK to avoid importing other paddle header files.
#include <vector>
#include "gflags/gflags.h"
#include "utils.h" // NOLINT
DEFINE_string(modeldir, "", "Directory of the inference model.");
namespace paddle {
namespace demo {
/*
* Use the onnxruntime engine to inference the demo.
*/
void Main() {
paddle::AnalysisConfig config;
config.EnableONNXRuntime();
config.SetModel(FLAGS_modeldir + "/inference.pdmodel",
FLAGS_modeldir + "/inference.pdiparams");
auto predictor = paddle_infer::CreatePredictor(config);
// Inference.
std::vector<int> input_shape = {1, 3, 224, 224};
std::vector<float> input_data(1 * 3 * 224 * 224, 1.0);
std::vector<float> out_data;
out_data.resize(1000);
auto input_names = predictor->GetInputNames();
auto output_names = predictor->GetOutputNames();
auto input_tensor = predictor->GetInputHandle(input_names[0]);
input_tensor->Reshape(input_shape);
auto output_tensor = predictor->GetOutputHandle(output_names[0]);
input_tensor->CopyFromCpu(input_data.data());
predictor->Run();
output_tensor->CopyToCpu(out_data.data());
VLOG(3) << "output.size " << out_data.size();
}
} // namespace demo
} // namespace paddle
int main(int argc, char** argv) {
::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
paddle::demo::Main();
return 0;
}
......@@ -21,7 +21,8 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
DATA_DIR=$4 # dataset
USE_TENSORRT=$5
TENSORRT_ROOT_DIR=$6 # TensorRT root dir, default to /usr
WITH_ONNXRUNTIME=$7
MSVC_STATIC_CRT=$8
inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir
WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform
......@@ -38,6 +39,26 @@ else
use_gpu_list='false'
fi
mkdir -p $DATA_DIR
cd $DATA_DIR
if [ $7 == ON ]; then
ONNXRUNTIME_LIB=${inference_install_dir}/third_party/install/onnxruntime/lib
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ONNXRUNTIME_LIB}
PADDLE2ONNX_LIB=${inference_install_dir}/third_party/install/paddle2onnx/lib
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${PADDLE2ONNX_LIB}
#download model
mkdir -p MobileNetV2
cd MobileNetV2
if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then
echo "MobileNetV2.inference.model.tar.gz has been downloaded."
else
wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz
tar xzf *.tar.gz
fi
cd ..
fi
PREFIX=inference-vis-demos%2F
URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
......@@ -58,8 +79,7 @@ function download() {
fi
cd ..
}
vis_demo_list='se_resnext50 ocr mobilenet'
for vis_demo_name in $vis_demo_list; do
download $vis_demo_name
......@@ -93,7 +113,8 @@ for WITH_STATIC_LIB in ON OFF; do
-DDEMO_NAME=simple_on_word2vec \
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \
-DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
for use_gpu in $use_gpu_list; do
Release/simple_on_word2vec.exe \
......@@ -112,7 +133,8 @@ for WITH_STATIC_LIB in ON OFF; do
-DDEMO_NAME=vis_demo \
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \
-DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
for use_gpu in $use_gpu_list; do
for vis_demo_name in $vis_demo_list; do
......@@ -138,7 +160,8 @@ for WITH_STATIC_LIB in ON OFF; do
-DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \
-DUSE_TENSORRT=$USE_TENSORRT \
-DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \
-DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
Release/trt_mobilenet_demo.exe \
--modeldir=$DATA_DIR/mobilenet/model \
......@@ -156,7 +179,8 @@ for WITH_STATIC_LIB in ON OFF; do
-DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=simple_on_word2vec \
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
make -j$(nproc)
word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model'
if [ -d $word2vec_model ]; then
......@@ -176,7 +200,8 @@ for WITH_STATIC_LIB in ON OFF; do
-DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=vis_demo \
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
make -j$(nproc)
for use_gpu in $use_gpu_list; do
for vis_demo_name in $vis_demo_list; do
......@@ -200,7 +225,8 @@ for WITH_STATIC_LIB in ON OFF; do
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-DUSE_TENSORRT=$USE_TENSORRT \
-DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \
-DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
make -j$(nproc)
./trt_mobilenet_demo \
--modeldir=$DATA_DIR/mobilenet/model \
......@@ -211,6 +237,26 @@ for WITH_STATIC_LIB in ON OFF; do
exit 1
fi
fi
# --------onnxruntime mobilenetv2 on linux/mac------
if [ $WITH_ONNXRUNTIME == ON ]; then
rm -rf *
cmake .. -DPADDLE_LIB=${inference_install_dir} \
-DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=onnxruntime_mobilenet_demo \
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-DUSE_TENSORRT=$USE_TENSORRT \
-DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \
-DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
make -j$(nproc)
./onnxruntime_mobilenet_demo \
--modeldir=$DATA_DIR/MobileNetV2/MobileNetV2
if [ $? -ne 0 ]; then
echo "onnxruntime demo onnxruntime_mobilenet_demo runs fail."
exit 1
fi
fi
fi
done
set +x
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
#include <glog/logging.h>
#include <algorithm>
#include <fstream>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid//platform/device/gpu/gpu_types.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
framework::proto::VarType::Type ConvertONNXType(
ONNXTensorElementDataType type) {
switch (type) {
case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT:
return framework::proto::VarType::FP32;
// case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16:
// return DataType::FP16;
case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8:
return framework::proto::VarType::INT8;
case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32:
return framework::proto::VarType::INT32;
case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64:
return framework::proto::VarType::INT64;
case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8:
return framework::proto::VarType::UINT8;
default:
LOG(ERROR) << "unsupported ONNX Tensor Type: " << static_cast<int>(type);
return framework::proto::VarType::FP32;
}
}
bool CheckConvertToONNX(const AnalysisConfig &config) {
if (!config.model_dir().empty()) {
LOG(ERROR) << "Paddle2ONNX not support model_dir config";
// TODO(heliqi jiangjiajun): Paddle2ONNX not support
// config.model_dir() + "/__model__"
// config.model_dir() + var_name
return false;
} else if (config.prog_file().empty() || config.params_file().empty()) {
LOG(ERROR) << string::Sprintf(
"not valid model path '%s' or program path '%s' or params path '%s'.",
config.model_dir(), config.prog_file(), config.params_file());
return false;
}
return paddle2onnx::IsExportable(config.prog_file(), config.params_file(),
config.model_from_memory());
}
bool ONNXRuntimePredictor::Init() {
VLOG(3) << "ONNXRuntime Predictor::init()";
  // For now, ONNXRuntime only supports CPU
if (config_.use_gpu()) {
place_ = paddle::platform::CUDAPlace(config_.gpu_device_id());
} else {
place_ = paddle::platform::CPUPlace();
}
scope_.reset(new paddle::framework::Scope());
sub_scope_ = &scope_->NewScope();
std::string onnx_proto;
paddle2onnx::Export(config_.prog_file(), config_.params_file(), &onnx_proto,
config_.model_from_memory());
Ort::SessionOptions session_options;
if (config_.ort_optimization_enabled()) {
session_options.SetGraphOptimizationLevel(
GraphOptimizationLevel::ORT_ENABLE_ALL);
}
  // Keep these optimizations off for now; turn them on once they are stable
// session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
// session_options.EnableCpuMemArena();
// session_options.EnableMemPattern();
// session_options.SetInterOpNumThreads(config_.cpu_math_library_num_threads());
session_options.SetIntraOpNumThreads(config_.cpu_math_library_num_threads());
VLOG(2) << "ONNXRuntime threads " << config_.cpu_math_library_num_threads();
if (config_.profile_enabled()) {
LOG(WARNING) << "ONNXRuntime Profiler is activated, which might affect the "
"performance";
#if defined(_WIN32)
session_options.EnableProfiling(L"ONNX");
#else
session_options.EnableProfiling("ONNX");
#endif
} else {
VLOG(2) << "ONNXRuntime Profiler is deactivated, and no profiling report "
"will be "
"generated.";
}
session_ = {env_, onnx_proto.data(), onnx_proto.size(), session_options};
auto memory_info =
Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
Ort::Allocator allocator(session_, memory_info);
framework::proto::VarType::Type proto_type =
framework::proto::VarType::LOD_TENSOR;
size_t n_inputs = session_.GetInputCount();
for (size_t i = 0; i < n_inputs; ++i) {
auto input_name = session_.GetInputName(i, allocator);
auto type_info = session_.GetInputTypeInfo(i);
std::vector<int64_t> shape =
type_info.GetTensorTypeAndShapeInfo().GetShape();
ONNXTensorElementDataType data_type =
type_info.GetTensorTypeAndShapeInfo().GetElementType();
input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type});
auto *ptr = scope_->Var(input_name);
framework::InitializeVariable(ptr, proto_type);
allocator.Free(input_name);
}
size_t n_outputs = session_.GetOutputCount();
for (size_t i = 0; i < n_outputs; ++i) {
auto output_name = session_.GetOutputName(i, allocator);
auto type_info = session_.GetOutputTypeInfo(i);
std::vector<int64_t> shape =
type_info.GetTensorTypeAndShapeInfo().GetShape();
ONNXTensorElementDataType data_type =
type_info.GetTensorTypeAndShapeInfo().GetElementType();
output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type});
auto *ptr = scope_->Var(output_name);
framework::InitializeVariable(ptr, proto_type);
allocator.Free(output_name);
}
return true;
}
template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kONNXRuntime>(
const AnalysisConfig &config) {
if (config.glog_info_disabled()) {
FLAGS_logtostderr = 1;
FLAGS_minloglevel = 2; // GLOG_ERROR
}
PADDLE_ENFORCE_EQ(
config.is_valid(), true,
platform::errors::InvalidArgument(
"Note: Each config can only be used for one predictor."));
VLOG(3) << "create ONNXRuntimePredictor";
std::unique_ptr<PaddlePredictor> predictor(new ONNXRuntimePredictor(config));
// Each config can only be used for one predictor.
config.SetInValid();
auto predictor_p = dynamic_cast<ONNXRuntimePredictor *>(predictor.get());
if (!predictor_p->Init()) {
return nullptr;
}
return predictor;
}
std::vector<std::string> ONNXRuntimePredictor::GetInputNames() {
std::vector<std::string> input_names;
for (auto input_desc : input_desc_) {
input_names.push_back(input_desc.name);
}
return input_names;
}
std::map<std::string, std::vector<int64_t>>
ONNXRuntimePredictor::GetInputTensorShape() {
std::map<std::string, std::vector<int64_t>> input_shapes;
for (auto input_desc : input_desc_) {
input_shapes[input_desc.name] = input_desc.shape;
}
return input_shapes;
}
std::vector<std::string> ONNXRuntimePredictor::GetOutputNames() {
std::vector<std::string> output_names;
for (auto output_desc : output_desc_) {
output_names.push_back(output_desc.name);
}
return output_names;
}
std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor(
const std::string &name) {
PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name),
platform::errors::PreconditionNotMet(
"The in variable named %s is not found in the "
"scope of the ONNXPredictor.",
name));
std::unique_ptr<ZeroCopyTensor> res(
new ZeroCopyTensor(static_cast<void *>(scope_.get())));
res->input_or_output_ = true;
res->SetName(name);
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else {
auto gpu_place = place_;
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res;
}
std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetOutputTensor(
const std::string &name) {
PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name),
platform::errors::PreconditionNotMet(
"The out variable named %s is not found in the "
"scope of the ONNXPredictor.",
name));
std::unique_ptr<ZeroCopyTensor> res(
new ZeroCopyTensor(static_cast<void *>(scope_.get())));
res->input_or_output_ = false;
res->SetName(name);
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else {
auto gpu_place = place_;
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res;
}
Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc,
const char *device_name) {
Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator,
place_.GetDeviceId(), OrtMemTypeDefault);
auto *var = scope_->FindVar(desc.name);
auto *tensor = var->GetMutable<framework::LoDTensor>();
size_t size =
tensor->numel() *
framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype()));
std::vector<int64_t> shape = phi::vectorize<int64_t>(tensor->dims());
return Ort::Value::CreateTensor(memory_info,
static_cast<void *>(tensor->data()), size,
shape.data(), shape.size(), desc.dtype);
}
void ONNXRuntimePredictor::AsTensor(const Ort::Value &value,
const ONNXDesc &desc) {
auto info = value.GetTensorTypeAndShapeInfo();
auto *var = scope_->FindVar(desc.name);
auto *tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(phi::make_ddim(info.GetShape()));
auto dtype = ConvertONNXType(info.GetElementType());
auto *ptr = tensor->mutable_data(place_, dtype);
if (platform::is_cpu_place(place_)) {
std::memcpy(ptr, const_cast<void *>(value.GetTensorData<void>()),
tensor->numel() * framework::SizeOfType(dtype));
} else {
auto src_place = place_;
auto dst_place = place_;
memory::Copy(dst_place, ptr, src_place,
const_cast<void *>(value.GetTensorData<void>()),
tensor->numel() * framework::SizeOfType(dtype));
}
}
bool ONNXRuntimePredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data,
int batch_size) {
LOG(ERROR) << "Not support Run";
return false;
}
bool ONNXRuntimePredictor::ZeroCopyRun() {
try {
Ort::IoBinding binding(session_);
std::vector<Ort::Value> inputs;
std::vector<Ort::Value> outputs;
Ort::RunOptions options;
inputs.reserve(input_desc_.size());
const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu";
for (auto desc : input_desc_) {
inputs.push_back(GetOrtValue(desc, device_name));
binding.BindInput(desc.name.c_str(), inputs.back());
}
    // TODO(heliqi): Optimization: move this output binding to Init()
for (auto desc : output_desc_) {
Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator,
place_.GetDeviceId(), OrtMemTypeDefault);
binding.BindOutput(desc.name.c_str(), memory_info);
}
session_.Run({}, binding);
outputs = binding.GetOutputValues();
for (size_t i = 0; i < output_desc_.size(); ++i) {
AsTensor(outputs[i], output_desc_[i]);
}
} catch (const std::exception &e) {
LOG(ERROR) << e.what();
return false;
}
return true;
}
std::unique_ptr<PaddlePredictor> ONNXRuntimePredictor::Clone() {
LOG(ERROR) << "Not support Clone(), Please create new Predictor";
return nullptr;
}
uint64_t ONNXRuntimePredictor::TryShrinkMemory() {
return paddle::memory::Release(place_);
}
ONNXRuntimePredictor::~ONNXRuntimePredictor() {
if (sub_scope_) {
scope_->DeleteScope(sub_scope_);
}
memory::Release(place_);
}
} // namespace paddle
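For orientation, a minimal end-to-end sketch of driving this predictor through the generic PaddlePredictor interface, assuming a MobileNet-like model with one 1x3x224x224 float input and a 1000-class output; the model paths and shapes are placeholders, while every API call used here appears elsewhere in this commit.

#include <vector>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  // Placeholder model files; Paddle2ONNX needs prog_file and params_file.
  config.SetModel("./model/inference.pdmodel", "./model/inference.pdiparams");
  config.EnableONNXRuntime();
  config.EnableORTOptimization();
  config.SetCpuMathLibraryNumThreads(2);

  auto predictor = paddle::CreatePaddlePredictor<
      paddle::AnalysisConfig, paddle::PaddleEngineKind::kONNXRuntime>(config);

  auto input_names = predictor->GetInputNames();
  auto input = predictor->GetInputTensor(input_names[0]);
  input->Reshape({1, 3, 224, 224});
  std::vector<float> data(1 * 3 * 224 * 224, 1.0f);
  input->CopyFromCpu(data.data());

  predictor->ZeroCopyRun();  // the only supported run path for this predictor

  auto output_names = predictor->GetOutputNames();
  auto output = predictor->GetOutputTensor(output_names[0]);
  std::vector<float> result(1000);
  output->CopyToCpu(result.data());
  return 0;
}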
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/op_compatible_info.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/string/printf.h"
#include "onnxruntime_c_api.h" // NOLINT
#include "onnxruntime_cxx_api.h" // NOLINT
#include "paddle2onnx/converter.h"
#ifdef PADDLE_WITH_TESTING
#include <gtest/gtest.h>
#include <gtest/gtest_prod.h>
#endif
///
/// \file onnxruntime_predictor.h
///
/// \brief A predictor using ONNXRuntime
///
/// \author heliqi@baidu.com
/// \date 2022-02-14
/// \since 2.3.0
///
namespace paddle {
bool CheckConvertToONNX(const AnalysisConfig &config);
struct ONNXDesc {
std::string name;
std::vector<int64_t> shape;
ONNXTensorElementDataType dtype;
};
///
/// \class ONNXRuntimePredictor
///
/// \brief A predictor that uses ONNXRuntime for inference
///
/// The predictor has the following typical uses:
///
/// Get predictor
/// \code{cpp}
/// auto predictor = CreatePaddlePredictor(config);
/// \endcode
///
/// Get input or output names
/// \code{cpp}
/// auto input_names = predictor->GetInputNames();
/// auto output_names = predictor->GetOutputNames();
/// \endcode
///
/// Get input or output tensors
/// \code{cpp}
/// auto input_t = predictor->GetInputTensor(input_names[0]);
/// auto output_t = predictor->GetOutputTensor(output_names[0]);
/// \endcode
///
/// Run predictor
/// \code{cpp}
/// predictor->ZeroCopyRun();
/// \endcode
///
class ONNXRuntimePredictor : public PaddlePredictor {
public:
///
/// \brief Construct a new ONNXRuntime Predictor object
///
  /// \param[in] config the AnalysisConfig used to build the predictor
///
explicit ONNXRuntimePredictor(const AnalysisConfig &config)
: config_(config) {
predictor_id_ = inference::GetUniqueId();
env_ = Ort::Env(ORT_LOGGING_LEVEL_INFO, "onnx");
}
///
/// \brief Destroy the ONNXRuntime Predictor object
///
~ONNXRuntimePredictor();
///
/// \brief Initialize predictor
///
/// \return Whether the init function executed successfully
///
bool Init();
///
/// \brief Get the input names
///
/// \return input names
///
std::vector<std::string> GetInputNames();
///
/// \brief Get the output names
///
/// \return output names
///
std::vector<std::string> GetOutputNames();
///
/// \brief Get the Input Tensor object
///
/// \param[in] name input name
/// \return input tensor
///
std::unique_ptr<ZeroCopyTensor> GetInputTensor(
const std::string &name) override;
///
/// \brief Get the Output Tensor object
///
  /// \param[in] name output name
/// \return output tensor
///
std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
const std::string &name) override;
///
/// \brief Get all input names and their corresponding shapes
///
/// \return the map of input names and shapes
///
std::map<std::string, std::vector<int64_t>> GetInputTensorShape() override;
  /// Not supported.
bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data,
int batch_size = -1) override;
///
/// \brief Run the prediction engine
///
/// \return Whether the function executed successfully
///
bool ZeroCopyRun() override;
///
  /// \brief Release all temporary tensors to compress the size of the memory
  /// pool.
/// The memory pool is considered to be composed of a list of chunks, if
/// the chunk is not occupied, it can be released.
///
/// \return Number of bytes released. It may be smaller than the actual
/// released memory, because part of the memory is not managed by the
/// MemoryPool.
///
uint64_t TryShrinkMemory() override;
///
  /// \brief Clone to get a new predictor. Thread safe. Not supported by
  /// ONNXRuntimePredictor, which returns nullptr.
  ///
  /// \return a new predictor
///
std::unique_ptr<PaddlePredictor> Clone() override;
std::shared_ptr<framework::Scope> scope_;
private:
///
  /// \brief Get the Ort::Value (input tensor).
  ///
  /// \param[in] desc an ONNXDesc (name, shape, dtype)
  ///
  /// \param[in] device_name device name, "Cpu" or "Cuda"
  ///
  /// \return an Ort::Value
///
Ort::Value GetOrtValue(const ONNXDesc &desc, const char *device_name);
///
  /// \brief Copy an Ort::Value (output tensor) into the corresponding tensor
  /// in the scope.
  ///
  /// \param[in] value Ort::Value (output tensor)
  ///
  /// \param[in] desc an ONNXDesc (name, shape, dtype)
///
void AsTensor(const Ort::Value &value, const ONNXDesc &desc);
private:
AnalysisConfig config_;
// ONNXRuntime
Ort::Env env_;
Ort::Session session_{nullptr};
platform::Place place_;
framework::Scope *sub_scope_{nullptr};
std::vector<ONNXDesc> input_desc_;
std::vector<ONNXDesc> output_desc_;
int predictor_id_;
  // Some more detailed tests are made friends of the predictor so that all
  // the details can be tested.
#ifdef PADDLE_WITH_TESTING
FRIEND_TEST(ONNXRuntimePredictor, onnxruntime_on);
#endif
};
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/fluid/platform/cpu_info.h"
DEFINE_string(dirname, "", "Directory of the inference model used in tests.");
namespace paddle {
TEST(ONNXRuntimePredictor, onnxruntime_on) {
AnalysisConfig config;
config.SetModel(FLAGS_dirname + "/inference.pdmodel",
FLAGS_dirname + "/inference.pdiparams");
config.EnableONNXRuntime();
config.EnableORTOptimization();
config.SetCpuMathLibraryNumThreads(2);
LOG(INFO) << config.Summary();
auto _predictor =
CreatePaddlePredictor<AnalysisConfig,
paddle::PaddleEngineKind::kONNXRuntime>(config);
ASSERT_TRUE(_predictor);
auto* predictor = static_cast<ONNXRuntimePredictor*>(_predictor.get());
ASSERT_TRUE(predictor);
ASSERT_TRUE(!predictor->Clone());
ASSERT_TRUE(predictor->scope_);
ASSERT_TRUE(predictor->sub_scope_);
ASSERT_EQ(predictor->scope_->parent(), nullptr);
ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get());
// Dummy Input Data
std::vector<int64_t> input_shape = {-1, 3, 224, 224};
std::vector<float> input_data(1 * 3 * 224 * 224, 1.0);
std::vector<float> out_data;
out_data.resize(1000);
// testing all interfaces
auto input_names = predictor->GetInputNames();
auto output_names = predictor->GetOutputNames();
auto get_input_shape = predictor->GetInputTensorShape();
ASSERT_EQ(input_names.size(), 1UL);
ASSERT_EQ(output_names.size(), 1UL);
ASSERT_EQ(input_names[0], "inputs");
ASSERT_EQ(output_names[0], "save_infer_model/scale_0.tmp_1");
ASSERT_EQ(get_input_shape["inputs"], input_shape);
auto input_tensor = predictor->GetInputTensor(input_names[0]);
input_tensor->Reshape({1, 3, 224, 224});
auto output_tensor = predictor->GetOutputTensor(output_names[0]);
input_tensor->CopyFromCpu(input_data.data());
ASSERT_TRUE(predictor->ZeroCopyRun());
output_tensor->CopyToCpu(out_data.data());
predictor->TryShrinkMemory();
}
} // namespace paddle
......@@ -319,6 +319,18 @@ struct PD_INFER_DECL AnalysisConfig {
///
void EnableNpu(int device_id = 0);
///
/// \brief Turn on ONNXRuntime.
///
void EnableONNXRuntime();
///
/// \brief Turn off ONNXRuntime.
///
void DisableONNXRuntime();
///
/// \brief Turn on ONNXRuntime Optimization.
///
void EnableORTOptimization();
///
/// \brief A boolean state telling whether the GPU is turned on.
///
/// \return bool Whether the GPU is turned on.
......@@ -342,6 +354,19 @@ struct PD_INFER_DECL AnalysisConfig {
///
bool use_ipu() const { return use_ipu_; }
///
/// \brief A boolean state telling whether the ONNXRuntime is turned on.
///
/// \return bool Whether the ONNXRuntime is turned on.
///
bool use_onnxruntime() const { return use_onnxruntime_; }
///
/// \brief A boolean state telling whether the ONNXRuntime Optimization is
/// turned on.
///
/// \return bool Whether the ONNXRuntime Optimization is turned on.
///
bool ort_optimization_enabled() const { return enable_ort_optimization_; }
///
/// \brief Get the GPU device id.
///
/// \return int The GPU device id.
......@@ -841,6 +866,10 @@ struct PD_INFER_DECL AnalysisConfig {
bool use_npu_{false};
int npu_device_id_{0};
// ONNXRuntime related
bool use_onnxruntime_{false};
bool enable_ort_optimization_{false};
// Padding related
bool use_fc_padding_{true};
......
......@@ -192,6 +192,7 @@ class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor {
private:
friend class AnalysisPredictor;
friend class ONNXRuntimePredictor;
explicit ZeroCopyTensor(void* scope) : paddle_infer::Tensor{scope} {}
};
......@@ -381,6 +382,7 @@ enum class PaddleEngineKind {
kNative = 0, ///< Use the native Fluid facility.
kAutoMixedTensorRT, ///< Automatically mix Fluid with TensorRT.
kAnalysis, ///< More optimization.
kONNXRuntime, ///< Use ONNXRuntime
};
template <typename ConfigT, PaddleEngineKind engine>
......@@ -395,6 +397,11 @@ template <>
PD_INFER_DECL std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config);
template <>
PD_INFER_DECL std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kONNXRuntime>(
const AnalysisConfig& config);
PD_INFER_DECL int PaddleDtypeSize(PaddleDType dtype);
PD_INFER_DECL std::string get_version();
......
......@@ -126,6 +126,26 @@ PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) {
return config->use_gpu();
}
void PD_ConfigEnableONNXRuntime(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
config->EnableONNXRuntime();
}
void PD_ConfigDisableONNXRuntime(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
config->DisableONNXRuntime();
}
PD_Bool PD_ConfigONNXRuntimeEnabled(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
return config->use_onnxruntime();
}
void PD_ConfigEnableORTOptimization(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
config->EnableORTOptimization();
}
void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config,
int32_t l3_workspace_size, PD_Bool locked,
PD_Bool autotune, const char* autotune_file,
......
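A minimal sketch of the new toggles from the C API side: the umbrella header name and the PD_ConfigCreate/PD_ConfigDestroy pair are assumptions based on the existing Paddle Inference C API, while the four ONNXRuntime calls are the ones added in this commit.

// Hedged sketch; the header name and the Create/Destroy pair are assumptions,
// not part of this diff.
#include "pd_inference_api.h"

int main() {
  PD_Config* config = PD_ConfigCreate();
  PD_ConfigEnableONNXRuntime(config);
  PD_ConfigEnableORTOptimization(config);
  // Reports true only when the library was built with -DWITH_ONNXRUNTIME=ON,
  // mirroring the expectations in the C API test later in this commit.
  PD_Bool ort_on = PD_ConfigONNXRuntimeEnabled(config);
  (void)ort_on;
  PD_ConfigDisableONNXRuntime(config);
  PD_ConfigDestroy(config);
  return 0;
}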
......@@ -152,6 +152,34 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGpu(
PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu(
__pd_keep PD_Config* pd_config);
///
/// \brief Turn on ONNXRuntime.
///
/// \param[in] pd_config config
///
PADDLE_CAPI_EXPORT extern void PD_ConfigEnableONNXRuntime(
__pd_keep PD_Config* pd_config);
///
/// \brief Turn off ONNXRuntime.
///
/// \param[in] pd_config config
///
PADDLE_CAPI_EXPORT extern void PD_ConfigDisableONNXRuntime(
__pd_keep PD_Config* pd_config);
///
/// \brief A boolean state telling whether the ONNXRuntime is turned on.
///
/// \return Whether the ONNXRuntime is turned on.
///
PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigONNXRuntimeEnabled(
__pd_keep PD_Config* pd_config);
///
/// \brief Turn on ONNXRuntime Optimization.
///
/// \param[in] pd_config config
///
PADDLE_CAPI_EXPORT extern void PD_ConfigEnableORTOptimization(
__pd_keep PD_Config* pd_config);
///
/// \brief Turn on XPU.
///
/// \param[in] pd_config config
......
......@@ -160,6 +160,36 @@ func (config *Config) EnableUseGpu(memorySize uint64, deviceId int32) {
C.PD_ConfigEnableUseGpu(config.c, C.uint64_t(memorySize), C.int32_t(deviceId))
}
///
/// \brief Turn on ONNXRuntime.
///
func (config *Config) EnableONNXRuntime() {
C.PD_ConfigEnableONNXRuntime(config.c)
}
///
/// \brief Turn off ONNXRuntime.
///
func (config *Config) DisableONNXRuntime() {
C.PD_ConfigDisableONNXRuntime(config.c)
}
///
/// \brief A boolean state telling whether the ONNXRuntime is turned on.
///
/// \return bool Whether the ONNXRuntime is turned on.
///
func (config *Config) ONNXRuntimeEnabled() bool {
return cvtPDBoolToGo(C.PD_ConfigONNXRuntimeEnabled(config.c))
}
///
/// \brief Turn on ONNXRuntime Optimization.
///
func (config *Config) EnableORTOptimization() {
C.PD_ConfigEnableORTOptimization(config.c)
}
///
/// \brief Turn on XPU.
///
......
......@@ -122,3 +122,20 @@ func TestMkldnn(t *testing.T) {
config.SetBfloat16Op([]string{"fc", "mul"})
}
func TestONNXRuntime(t *testing.T) {
config := NewConfig()
config.SetModelDir("modelDir")
t.Log(config.ModelDir())
config.EnableONNXRuntime()
t.Logf("ONNXRuntimeEnabled:%+v", config.ONNXRuntimeEnabled())
config.DisableONNXRuntime()
t.Logf("ONNXRuntimeEnabled:%+v", config.ONNXRuntimeEnabled())
config.EnableORTOptimization()
config.SetCpuMathLibraryNumThreads(4)
t.Logf("CpuMathLibraryNumThreads:%+v", config.CpuMathLibraryNumThreads())
}
\ No newline at end of file
......@@ -66,6 +66,42 @@ func TestNewPredictor(t *testing.T) {
cloned.ClearIntermediateTensor()
}
func TestONNXRuntimePredictor(t *testing.T) {
t.Logf("Version:\n%+v", Version())
config := NewConfig()
config.SetModel("./mobilenetv1/inference.pdmodel", "./mobilenetv1/inference.pdiparams")
config.EnableONNXRuntime()
config.EnableORTOptimization()
predictor := NewPredictor(config)
inNames := predictor.GetInputNames()
t.Logf("InputNames:%+v", inNames)
outNames := predictor.GetOutputNames()
t.Logf("OutputNames:%+v", outNames)
inHandle := predictor.GetInputHandle(inNames[0])
inHandle.Reshape([]int32{1, 3, 224, 224})
t.Logf("inHandle name:%+v, shape:%+v", inHandle.Name(), inHandle.Shape())
data := make([]float32, numElements([]int32{1, 3, 224, 224}))
for i := 0; i < int(numElements([]int32{1, 3, 224, 224})); i++ {
data[i] = float32(i%255) * 0.1
}
inHandle.CopyFromCpu(data)
t.Logf("inHandle Type:%+v", inHandle.Type())
predictor.Run()
outHandle := predictor.GetOutputHandle(outNames[0])
t.Logf("outHandle name:%+v", outHandle.Name())
outShape := outHandle.Shape()
t.Logf("outHandle Shape:%+v", outShape)
outData := make([]float32, numElements(outShape))
outHandle.CopyToCpu(outData)
t.Log(outData)
}
func TestFromBuffer(t *testing.T) {
modelFile, err := os.Open("./mobilenetv1/inference.pdmodel")
if err != nil {
......
......@@ -22,6 +22,7 @@ fi
# 2. set LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/mklml/lib/:$PWD/paddle_inference_c/third_party/install/mkldnn/lib/:$PWD/paddle_inference_c/paddle/lib/
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/onnxruntime/lib/:$PWD/paddle_inference_c/third_party/install/paddle2onnx/lib/
# 3. go test
go clean -testcache
......
......@@ -89,5 +89,5 @@ class DropoutOpConverter : public OpConverter {
} // namespace inference
} // namespace paddle
USE_OP(dropout);
USE_OP_ITSELF(dropout);
REGISTER_TRT_OP_CONVERTER(dropout, DropoutOpConverter);
......@@ -52,7 +52,7 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); }
} // namespace inference
} // namespace paddle
USE_OP(relu);
USE_OP_ITSELF(relu);
USE_OP(sigmoid);
USE_OP(tanh);
USE_OP(relu6);
......@@ -57,4 +57,4 @@ TEST(DropoutOpConverter, main) {
} // namespace inference
} // namespace paddle
USE_OP(dropout);
USE_OP_ITSELF(dropout);
......@@ -81,6 +81,18 @@ TEST(PD_Config, interface) {
PD_ConfigSetBfloat16Op(config, 1, &ops_name);
#endif
PD_ConfigEnableONNXRuntime(config);
bool onnxruntime_enabled = PD_ConfigONNXRuntimeEnabled(config);
#ifdef PADDLE_WITH_ONNXRUNTIME
EXPECT_TRUE(onnxruntime_enabled);
#else
EXPECT_FALSE(onnxruntime_enabled);
#endif
PD_ConfigDisableONNXRuntime(config);
bool onnxruntime_disabled = PD_ConfigONNXRuntimeEnabled(config);
EXPECT_FALSE(onnxruntime_disabled);
PD_ConfigEnableORTOptimization(config);
PD_ConfigEnableMemoryOptim(config, true);
bool memory_enabled = PD_ConfigMemoryOptimEnabled(config);
EXPECT_TRUE(memory_enabled);
......
......@@ -5,6 +5,7 @@ option(WITH_GPU "Compile demo with GPU/CPU, default use CPU."
option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." OFF)
option(USE_TENSORRT "Compile demo with TensorRT." OFF)
option(WITH_GTEST "Compile demo with GTEST" OFF)
option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF)
if(NOT WITH_STATIC_LIB)
add_definitions("-DPADDLE_WITH_SHARED_LIB")
......@@ -45,6 +46,13 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib")
link_directories("${PADDLE_LIB}/paddle/lib")
if (WITH_ONNXRUNTIME)
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib")
endif()
if (WIN32)
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
......@@ -172,6 +180,16 @@ else()
endif()
endif()
if (WITH_ONNXRUNTIME)
if(WIN32)
set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx)
elseif(APPLE)
set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx)
else()
set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx)
endif()
endif()
if (NOT WIN32)
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
set(DEPS ${DEPS}
......@@ -248,6 +266,14 @@ if(WIN32)
COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release
)
endif()
if(WITH_ONNXRUNTIME)
add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll
${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll
${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
)
endif()
if(NOT WITH_STATIC_LIB)
add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
......
......@@ -20,7 +20,8 @@ TURN_ON_MKL=$2 # use MKL or Openblas
TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
DATA_DIR=$4 # dataset
TENSORRT_ROOT_DIR=$5 # TensorRT ROOT dir, default to /usr/local/TensorRT
MSVC_STATIC_CRT=$6
WITH_ONNXRUNTIME=$6
MSVC_STATIC_CRT=$7
inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir
EXIT_CODE=0 # init default exit code
WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform
......@@ -144,7 +145,8 @@ function compile_test() {
-DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \
-DWITH_GTEST=ON \
-DCMAKE_CXX_FLAGS='/std:c++17' \
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_BUILD_TYPE=Release \
-DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
msbuild /maxcpucount /property:Configuration=Release ALL_BUILD.vcxproj
else
cmake .. -DPADDLE_LIB=${inference_install_dir} \
......@@ -154,7 +156,8 @@ function compile_test() {
-DWITH_STATIC_LIB=OFF \
-DUSE_TENSORRT=$USE_TENSORRT \
-DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \
-DWITH_GTEST=ON
-DWITH_GTEST=ON \
-DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
make -j$(nproc)
fi;
cd -
......
......@@ -80,6 +80,14 @@ if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inferenc
endif()
set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model")
if(WITH_ONNXRUNTIME)
set(MOBILENETV2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/MobileNetV2")
if(NOT EXISTS ${MOBILENETV2_INSTALL_DIR}/MobileNetV2.inference.model.tar.gz)
inference_download_and_uncompress_without_verify(${MOBILENETV2_INSTALL_DIR} ${INFERENCE_URL} "MobileNetV2.inference.model.tar.gz")
endif()
set(MOBILENETV2_MODEL_DIR "${MOBILENETV2_INSTALL_DIR}/MobileNetV2")
endif()
function (inference_base_test_build TARGET)
set(options "")
set(oneValueArgs "")
......
......@@ -161,7 +161,7 @@ cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEP
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function
lod_tensor maxouting unpooling pooling lod_rank_table context_project
sequence_pooling segment_pooling executor device_memory_aligment generator)
sequence_pooling executor device_memory_aligment generator)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse matrix_solve)
......
......@@ -132,7 +132,9 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx)
: CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
......@@ -146,7 +148,9 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> {
: CudnnActivationGradFunctor<T>(ctx, 6.0,
GPUDNN_ACTIVATION_CLIPPED_RELU) {}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
......@@ -159,7 +163,9 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx)
: CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
......@@ -172,7 +178,9 @@ struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx)
: CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename Functor>
......@@ -197,7 +205,8 @@ class CudnnActivationGradKernel
public:
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& context) const override {
static_assert(Functor::FwdDeps() == kDepOut, "Forward deps must be Out.");
static_assert(Functor::FwdDeps() == ActBwdOpFwdDeps::kDepOut,
"Forward deps must be Out.");
const framework::Tensor *X, *Out, *dOut;
X = Out = dOut = nullptr;
......
......@@ -34,7 +34,8 @@ using paddle::framework::Tensor;
template <typename GradFunctor>
static constexpr bool CanInplaceAct() {
return GradFunctor::FwdDeps() == kDepOut || GradFunctor::FwdDeps() == kNoDeps;
return GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kDepOut ||
GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kNoDeps;
}
#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \
......@@ -921,7 +922,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
if (static_cast<int>(kDepValue) & static_cast<int>(kDepX)) {
if (static_cast<int>(kDepValue) &
static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
if (ctx->HasOutput("DX")) {
ctx->ShareDim("X", "DX");
ctx->ShareLoD("X", "DX");
......@@ -931,7 +933,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel {
ctx->ShareLoD("X", "DDOut");
}
}
if (static_cast<int>(kDepValue) & static_cast<int>(kDepOut)) {
if (static_cast<int>(kDepValue) &
static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
if (ctx->HasOutput("DOut")) {
ctx->ShareDim("Out", "DOut");
ctx->ShareLoD("Out", "DOut");
......@@ -960,13 +963,15 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
if (static_cast<int>(kDepValue) & static_cast<int>(kDepX)) {
if (static_cast<int>(kDepValue) &
static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
if (ctx->HasOutput("DDOut")) {
ctx->ShareDim("X", "DDOut");
ctx->ShareLoD("X", "DDOut");
}
}
if (static_cast<int>(kDepValue) & static_cast<int>(kDepOut)) {
if (static_cast<int>(kDepValue) &
static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
if (ctx->HasOutput("DDOut")) {
ctx->ShareDim("Out", "DDOut");
ctx->ShareLoD("Out", "DDOut");
......@@ -987,7 +992,8 @@ class ActivationOpTripleGrad : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
if (static_cast<int>(kDepValue) & static_cast<int>(kDepX)) {
if (static_cast<int>(kDepValue) &
static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
if (ctx->HasOutput("DX")) {
ctx->ShareDim("X", "DX");
ctx->ShareLoD("X", "DX");
......@@ -997,7 +1003,8 @@ class ActivationOpTripleGrad : public framework::OperatorWithKernel {
ctx->ShareLoD("X", "DDOut");
}
}
if (static_cast<int>(kDepValue) & static_cast<int>(kDepOut)) {
if (static_cast<int>(kDepValue) &
static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
if (ctx->HasOutput("D_DOut")) {
ctx->ShareDim("Out", "D_DOut");
ctx->ShareLoD("Out", "D_DOut");
......@@ -1464,6 +1471,18 @@ namespace plat = paddle::platform;
FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP);
FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL);
REGISTER_ACTIVATION_OP(cos, Cos, CosFunctor, CosGradFunctor)
REGISTER_ACTIVATION_OP(tan, Tan, TanFunctor, TanGradFunctor);
REGISTER_ACTIVATION_OP(acos, Acos, AcosFunctor, AcosGradFunctor);
REGISTER_ACTIVATION_OP(sin, Sin, SinFunctor, SinGradFunctor);
REGISTER_ACTIVATION_OP(asin, Asin, AsinFunctor, AsinGradFunctor);
REGISTER_ACTIVATION_OP(atan, Atan, AtanFunctor, AtanGradFunctor);
REGISTER_ACTIVATION_OP(sinh, Sinh, SinhFunctor, SinhGradFunctor);
REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor);
REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor);
REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor);
REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor);
/* ========================== sigmoid register =============================
*/
// 1. Register Sigmoid Operator
......@@ -1584,16 +1603,6 @@ REGISTER_OPERATOR(
ops::ActivationOpDoubleGrad2<ops::ReluGradFunctor<float>::FwdDeps()>,
ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_ACTIVATION_CPU_KERNEL(relu, Relu, ReluCPUFunctor, ReluGradFunctor);
REGISTER_OP_CPU_KERNEL(
relu_grad_grad,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::ReluGradGradFunctor<float>>,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::ReluGradGradFunctor<double>>,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::ReluGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ======================== leaky relu register ============================ */
......
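The repeated static_cast masking above reflects that, after this change, the forward-dependency flags are a scoped enum used as a bit mask. A self-contained sketch of the pattern with hypothetical flag values, since the enum's actual definition in activation_op.h is folded out of this page:

#include <iostream>

// Hypothetical stand-in for ActBwdOpFwdDeps; the real definition lives in
// activation_op.h, whose diff is collapsed below.
enum class ActBwdOpFwdDeps : int { kNoDeps = 0x00, kDepX = 0x01, kDepOut = 0x02 };

// Scoped enums do not convert to int implicitly, hence the explicit casts
// used throughout the InferShape changes above.
static bool DependsOnOut(ActBwdOpFwdDeps deps) {
  return (static_cast<int>(deps) &
          static_cast<int>(ActBwdOpFwdDeps::kDepOut)) != 0;
}

int main() {
  std::cout << DependsOnOut(ActBwdOpFwdDeps::kDepOut) << "\n";  // prints 1
  std::cout << DependsOnOut(ActBwdOpFwdDeps::kDepX) << "\n";    // prints 0
  return 0;
}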
This diff has been collapsed.
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/argsort_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/argsort_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
......
......@@ -23,7 +23,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
This diff has been collapsed.
......@@ -26,7 +26,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
The remaining file diffs in this commit have been collapsed.