Unverified commit 0b60f28c, authored by engineer1109, committed by GitHub

remove WITH_ASCEND_CL PADDLE_WITH_ASCEND_CL WITH_ASCEND_CXX11 (#52448)

Parent: 04f8c24e
@@ -58,10 +58,6 @@ option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF)
# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
# to develop some acl related functionality on x86
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF)
option(WITH_CUSPARSELT "Compile PaddlePaddle with CUSPARSELT" OFF)
option(WITH_SETUP_INSTALL "Compile PaddlePaddle with setup.py" OFF)
@@ -113,14 +109,6 @@ if(APPLE AND WITH_ARM)
  set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin")
endif()
if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
if(WITH_ARM_BRPC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
endif()
if(WIN32)
  option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
@@ -525,15 +513,6 @@ if(WITH_DISTRIBUTE)
      ON
      CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
endif()
if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC)
# disable WITH_PSCORE for NPU before include third_party
message(
WARNING
"Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.")
set(WITH_PSCORE
OFF
CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE)
endif()
if(WITH_ROCM AND HIP_VERSION LESS_EQUAL 40020496)
  # TODO(qili93): third-party rocksdb throw Illegal instruction with HIP version 40020496
  message(
@@ -567,13 +546,6 @@ if(WITH_RPC)
      OFF
      CACHE BOOL "Disable WITH_RPC when not compiled with distribute" FORCE)
endif()
if(WITH_ASCEND_CL AND WITH_RPC)
message(
WARNING "Disable WITH_RPC when compiling with NPU. Force WITH_RPC=OFF.")
set(WITH_RPC
OFF
CACHE BOOL "Disable WITH_RPC when compiling with NPU" FORCE)
endif()
if(WITH_ROCM AND WITH_RPC)
  message(
    WARNING "Disable WITH_RPC when compiling with ROCM. Force WITH_RPC=OFF.")
......
@@ -97,10 +97,6 @@ if(WITH_ASCEND)
  add_definitions(-DPADDLE_WITH_ASCEND)
endif()
if(WITH_ASCEND_CL)
add_definitions(-DPADDLE_WITH_ASCEND_CL)
endif()
if(WITH_ASCEND_INT64)
  add_definitions(-DPADDLE_WITH_ASCEND_INT64)
endif()
......
@@ -25,111 +25,3 @@ if(EXISTS
  # It means CANN 20.2 +
  add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()
if(WITH_ASCEND OR WITH_ASCEND_CL)
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR}
${ASCEND_ATC_DIR})
set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_RUNTIME_INC_DIR
${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR}
${ATLAS_ATC_DIR})
set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
include_directories(${ATLAS_RUNTIME_INC_DIR})
add_library(ascend_ge SHARED IMPORTED GLOBAL)
set_property(TARGET ascend_ge PROPERTY IMPORTED_LOCATION
${atlas_ge_runner_lib})
add_library(ascend_graph SHARED IMPORTED GLOBAL)
set_property(TARGET ascend_graph PROPERTY IMPORTED_LOCATION
${atlas_graph_lib})
add_library(atlas_acl SHARED IMPORTED GLOBAL)
set_property(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
endif()
if(WITH_ASCEND_CL)
set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
include_directories(${FWKACLLIB_INC_DIR})
include_directories(${ACLLIB_INC_DIR})
add_library(ascendcl SHARED IMPORTED GLOBAL)
set_property(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
add_library(ascend_hccl SHARED IMPORTED GLOBAL)
set_property(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
add_library(acl_op_compiler SHARED IMPORTED GLOBAL)
set_property(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION
${acl_op_compiler_lib})
add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
endif()
if(WITH_ASCEND_CL)
macro(find_ascend_toolkit_version ascend_toolkit_version_info)
file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS)
string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)"
ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)" "\\1"
ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION
${ASCEND_TOOLKIT_VERSION})
string(SUBSTRING "${CANN_VERSION}000" 0 6 CANN_VERSION)
add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}")
if(NOT ASCEND_TOOLKIT_VERSION)
set(ASCEND_TOOLKIT_VERSION "???")
else()
message(
STATUS "Current Ascend Toolkit version is ${ASCEND_TOOLKIT_VERSION}")
endif()
endmacro()
macro(find_ascend_driver_version ascend_driver_version_info)
file(READ ${ascend_driver_version_info} ASCEND_DRIVER_VERSION_CONTENTS)
string(REGEX MATCH "Version=([0-9]+\.[0-9]+\.[0-9]+)" ASCEND_DRIVER_VERSION
"${ASCEND_DRIVER_VERSION_CONTENTS}")
string(REGEX REPLACE "Version=([0-9]+\.[0-9]+\.[0-9]+)" "\\1"
ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION}")
if(NOT ASCEND_DRIVER_VERSION)
set(ASCEND_DRIVER_VERSION "???")
else()
message(
STATUS "Current Ascend Driver version is ${ASCEND_DRIVER_VERSION}")
endif()
endmacro()
if(WITH_ARM)
set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/arm64-linux)
else()
set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/x86_64-linux)
endif()
find_ascend_toolkit_version(${ASCEND_TOOLKIT_DIR}/ascend_toolkit_install.info)
find_ascend_driver_version(${ASCEND_DIR}/driver/version.info)
endif()
@@ -61,44 +61,24 @@ if(CMAKE_COMPILER_IS_GNUCC)
endif()
include_directories(${GLOO_INCLUDE_DIR})
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  ExternalProject_Add(
-    ${GLOO_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${GLOO_REPOSITORY}
-    GIT_TAG ${GLOO_TAG}
-    PREFIX "${GLOO_PREFIX_DIR}"
-    UPDATE_COMMAND ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND
-      mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake
-      .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && ${CMAKE_COMMAND} --build . &&
-      mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
-    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
-      ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
-    COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
-      "${GLOO_INCLUDE_DIR}/gloo"
-    BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
-else()
-  ExternalProject_Add(
-    ${GLOO_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${GLOO_REPOSITORY}
-    GIT_TAG ${GLOO_TAG}
-    PREFIX "${GLOO_PREFIX_DIR}"
-    UPDATE_COMMAND ""
-    PATCH_COMMAND ${GLOO_PATCH_COMMAND}
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND
-      mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake
-      .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && ${CMAKE_COMMAND} --build . &&
-      mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
-    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
-      ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
-    COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
-      "${GLOO_INCLUDE_DIR}/gloo"
-    BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
-endif()
+ExternalProject_Add(
+  ${GLOO_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${GLOO_REPOSITORY}
+  GIT_TAG ${GLOO_TAG}
+  PREFIX "${GLOO_PREFIX_DIR}"
+  UPDATE_COMMAND ""
+  PATCH_COMMAND ${GLOO_PATCH_COMMAND}
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND
+    mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake ..
+    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && ${CMAKE_COMMAND} --build . && mkdir
+    -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
+  INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
+    ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
+  COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
+    "${GLOO_INCLUDE_DIR}/gloo"
+  BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
add_library(gloo STATIC IMPORTED GLOBAL)
set_property(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIBRARIES})
......
@@ -237,9 +237,6 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST)
  if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
    set(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
    set(PROTOBUF_TAG v21.12)
elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
set(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
set(PROTOBUF_TAG v21.12)
  elseif(WITH_IPU)
    set(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
    set(PROTOBUF_TAG v21.12)
@@ -325,9 +322,7 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST)
  endif()
endfunction()
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  set(PROTOBUF_VERSION 21.12)
-elseif(WITH_IPU)
+if(WITH_IPU)
  set(PROTOBUF_VERSION 21.12)
elseif(WITH_ARM_BRPC)
  set(PROTOBUF_VERSION 21.12-baidu-ee-common)
......
@@ -15,11 +15,7 @@
include(ExternalProject)
set(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool)
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  set(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git)
-else()
-  set(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
-endif()
+set(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
set(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040)
set(THREADPOOL_INCLUDE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool)
......
@@ -64,96 +64,59 @@ else()
  set(USE_OMP ON)
endif()
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  ExternalProject_Add(
-    extern_warpctc
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${WARPCTC_REPOSITORY}
-    GIT_TAG ${WARPCTC_TAG}
-    PREFIX ${WARPCTC_PREFIX_DIR}
-    #UPDATE_COMMAND ""
-    PATCH_COMMAND ""
-    BUILD_ALWAYS 1
-    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-               -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-               -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-               -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-               -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-               -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-               -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-               -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-               -DWITH_GPU=${WITH_GPU}
-               -DWITH_ROCM=${WITH_ROCM}
-               -DWITH_OMP=${USE_OMP}
-               -DWITH_TORCH=OFF
-               -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-               -DBUILD_SHARED=ON
-               -DBUILD_TESTS=OFF
-               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-               -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-               ${EXTERNAL_OPTIONAL_ARGS}
-    CMAKE_CACHE_ARGS
-      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-      -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
-    BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES})
-else()
-  if(WIN32)
-    set(WARPCTC_C_FLAGS $<FILTER:${CMAKE_C_FLAGS},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_C_FLAGS_DEBUG
-        $<FILTER:${CMAKE_C_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_C_FLAGS_RELEASE
-        $<FILTER:${CMAKE_C_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_CXX_FLAGS $<FILTER:${CMAKE_CXX_FLAGS},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_CXX_FLAGS_RELEASE
-        $<FILTER:${CMAKE_CXX_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_CXX_FLAGS_DEBUG
-        $<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
-  else()
-    set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS})
-    set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
-    set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
-    set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS})
-    set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
-    set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
-  endif()
-  ExternalProject_Add(
-    extern_warpctc
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${WARPCTC_REPOSITORY}
-    GIT_TAG ${WARPCTC_TAG}
-    PREFIX ${WARPCTC_PREFIX_DIR}
-    UPDATE_COMMAND ""
-    PATCH_COMMAND ${WARPCTC_PATCH_COMMAND}
-    #BUILD_ALWAYS 1
-    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-               -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS}
-               -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG}
-               -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE}
-               -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS}
-               -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE}
-               -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG}
-               -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-               -DWITH_GPU=${WITH_GPU}
-               -DWITH_ROCM=${WITH_ROCM}
-               -DWITH_OMP=${USE_OMP}
-               -DWITH_TORCH=OFF
-               -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-               -DBUILD_SHARED=ON
-               -DBUILD_TESTS=OFF
-               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-               -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-               -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
-               ${EXTERNAL_OPTIONAL_ARGS}
-               ${WARPCTC_CCBIN_OPTION}
-    CMAKE_CACHE_ARGS
-      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-      -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
-    BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES})
-endif()
+if(WIN32)
+  set(WARPCTC_C_FLAGS $<FILTER:${CMAKE_C_FLAGS},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_C_FLAGS_DEBUG $<FILTER:${CMAKE_C_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_C_FLAGS_RELEASE
+      $<FILTER:${CMAKE_C_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_CXX_FLAGS $<FILTER:${CMAKE_CXX_FLAGS},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_CXX_FLAGS_RELEASE
+      $<FILTER:${CMAKE_CXX_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_CXX_FLAGS_DEBUG
+      $<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
+else()
+  set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS})
+  set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
+  set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
+  set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+  set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
+  set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
+endif()
+ExternalProject_Add(
+  extern_warpctc
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${WARPCTC_REPOSITORY}
+  GIT_TAG ${WARPCTC_TAG}
+  PREFIX ${WARPCTC_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  PATCH_COMMAND ${WARPCTC_PATCH_COMMAND}
+  #BUILD_ALWAYS 1
+  CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+             -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+             -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS}
+             -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG}
+             -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS}
+             -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG}
+             -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
+             -DWITH_GPU=${WITH_GPU}
+             -DWITH_ROCM=${WITH_ROCM}
+             -DWITH_OMP=${USE_OMP}
+             -DWITH_TORCH=OFF
+             -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
+             -DBUILD_SHARED=ON
+             -DBUILD_TESTS=OFF
+             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+             -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
+             ${EXTERNAL_OPTIONAL_ARGS}
+             ${WARPCTC_CCBIN_OPTION}
+  CMAKE_CACHE_ARGS
+    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+    -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+    -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
+  BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES})
message(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") message(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY)
......
@@ -167,10 +167,6 @@ if(NOT WIN32)
  set(COMMON_FLAGS ${COMMON_FLAGS} -Wno-sign-compare -Wno-non-virtual-dtor)
endif()
if(WITH_ASCEND_CL AND WITH_ARM_BRPC)
set(COMMON_FLAGS ${COMMON_FLAGS} -faligned-new)
endif()
if(NOT APPLE)
  if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM))
    set(COMMON_FLAGS
......
@@ -508,14 +508,9 @@ function(version version_file)
    OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
  file(
    WRITE ${version_file}
"GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" "WITH_MKL: ${WITH_MKL}\n"
"WITH_MKL: ${WITH_MKL}\n" "WITH_MKLDNN: ${WITH_MKLDNN}\n" "WITH_GPU: ${WITH_GPU}\n"
"WITH_MKLDNN: ${WITH_MKLDNN}\n" "WITH_ROCM: ${WITH_ROCM}\n" "WITH_IPU: ${WITH_IPU}\n")
"WITH_GPU: ${WITH_GPU}\n"
"WITH_ROCM: ${WITH_ROCM}\n"
"WITH_ASCEND_CL: ${WITH_ASCEND_CL}\n"
"WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n"
"WITH_IPU: ${WITH_IPU}\n")
  if(WITH_GPU)
    file(APPEND ${version_file}
         "CUDA version: ${CUDA_VERSION}\n"
@@ -526,11 +521,6 @@ function(version version_file)
         "HIP version: v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}\n"
         "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n")
  endif()
if(WITH_ASCEND_CL)
file(APPEND ${version_file}
"Ascend Toolkit version: ${ASCEND_TOOLKIT_VERSION}\n"
"Ascend Driver version: ${ASCEND_DRIVER_VERSION}\n")
endif()
  if(WITH_IPU)
    file(APPEND ${version_file} "PopART version: ${POPART_VERSION}\n")
  endif()
......
@@ -74,9 +74,6 @@ function(op_library TARGET)
  set(MKLDNN_FILE)
  set(op_common_deps operator op_registry math_function layer
                     common_infer_shape_functions)
if(WITH_ASCEND_CL)
set(op_common_deps ${op_common_deps} npu_op_runner)
endif()
  if(WITH_MLU)
    set(op_common_deps ${op_common_deps} mlu_baseop)
  endif()
@@ -175,12 +172,6 @@ function(op_library TARGET)
      list(APPEND xpu_kp_cc_srcs ${TARGET}.kps)
    endif()
  endif()
if(WITH_ASCEND_CL)
string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
list(APPEND npu_cc_srcs ${NPU_FILE}.cc)
endif()
endif()
  if(WITH_MLU)
    string(REPLACE "_op" "_op_mlu" MLU_FILE "${TARGET}")
    if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MLU_FILE}.cc)
@@ -213,8 +204,6 @@ function(op_library TARGET)
      list(APPEND xpu_kp_cc_srcs ${src})
    elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$")
      list(APPEND xpu_kp_cc_srcs ${src})
elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
list(APPEND npu_cc_srcs ${src})
    elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
      list(APPEND mlu_cc_srcs ${src})
    elseif(${src} MATCHES ".*\\.cc$")
@@ -331,13 +320,6 @@ function(op_library TARGET)
      SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs}
      DEPS ${op_library_DEPS} ${op_common_deps})
  else()
# deal with CANN version control while registering NPU operators before build
if(WITH_ASCEND_CL)
if(CANN_VERSION LESS 504000)
list(REMOVE_ITEM npu_cc_srcs "multinomial_op_npu.cc")
list(REMOVE_ITEM npu_cc_srcs "take_along_axis_op_npu.cc")
endif()
endif()
    # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
    if(WITH_UNITY_BUILD AND op_library_UNITY)
      # Combine the cc source files.
@@ -541,18 +523,6 @@ function(op_library TARGET)
    endforeach()
  endif()
# pybind USE_OP_DEVICE_KERNEL for NPU
if(WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
foreach(npu_src ${npu_cc_srcs})
set(op_name "")
find_register(${npu_src} "REGISTER_OP_NPU_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, NPU);\n")
set(pybind_flag 1)
endif()
endforeach()
endif()
  # pybind USE_OP_DEVICE_KERNEL for MLU
  if(WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0)
    foreach(mlu_src ${mlu_cc_srcs})
......
@@ -394,16 +394,6 @@ if(WITH_BOX_PS)
  list(APPEND third_party_deps extern_box_ps)
endif()
if(WITH_ASCEND OR WITH_ASCEND_CL)
include(external/ascend)
if(WITH_ASCEND OR WITH_ASCEND_CL)
list(APPEND third_party_deps extern_ascend)
endif()
if(WITH_ASCEND_CL)
list(APPEND third_party_deps extern_ascend_cl)
endif()
endif()
if(WITH_PSCORE)
  include(external/snappy)
  list(APPEND third_party_deps extern_snappy)
......
@@ -205,17 +205,10 @@ elseif(WITH_ROCM)
    SRCS fused_broadcast_op_handle.cc
    DEPS broadcast_op_handle)
else()
-  if(WITH_ASCEND_CL)
-    cc_library(
-      nan_inf_utils
-      SRCS nan_inf_utils_detail.cc
-      DEPS npu_op_runner framework_proto scope place)
-  else()
-    cc_library(
-      nan_inf_utils
-      SRCS nan_inf_utils_detail.cc
-      DEPS framework_proto scope place)
-  endif()
+  cc_library(
+    nan_inf_utils
+    SRCS nan_inf_utils_detail.cc
+    DEPS framework_proto scope place)
  cc_library(
    all_reduce_op_handle
    SRCS all_reduce_op_handle.cc
......
@@ -54,12 +54,6 @@ void CheckOpHasNanOrInfInDygraph(const std::string& op_type,
  }
}
#ifdef PADDLE_WITH_ASCEND_CL
void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
const framework::Scope& scope,
const platform::Place& place);
#endif
}  // namespace details
}  // namespace framework
}  // namespace paddle
@@ -19,8 +19,6 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/phi/common/amp_type_traits.h"
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/convert_utils.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h"
...@@ -243,40 +241,6 @@ void CheckVarHasNanOrInf(const std::string& op_type, ...@@ -243,40 +241,6 @@ void CheckVarHasNanOrInf(const std::string& op_type,
"phi::DenseTensor[%s] use xpu place. PaddlePaddle must compile " "phi::DenseTensor[%s] use xpu place. PaddlePaddle must compile "
"with XPU.", "with XPU.",
var_name)); var_name));
#endif
return;
} else if (platform::is_npu_place(tensor->place())) {
#ifdef PADDLE_WITH_ASCEND_CL
if (framework::TransToProtoVarType(tensor->dtype()) !=
proto::VarType::FP32) {
return;
}
phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(tensor->dims());
float* cpu_data = static_cast<float*>(
cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype()));
framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
bool flag = false;
for (int i = 0; i < cpu_tensor.numel(); i++) {
if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
flag = true;
break;
}
}
PADDLE_ENFORCE_NE(
flag,
true,
platform::errors::Fatal(
"Operator %s output phi::DenseTensor %s contains Inf.",
op_type,
var_name));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"phi::DenseTensor[%s] use npu place. PaddlePaddle must compile "
"with NPU.",
var_name));
#endif
    return;
  }
@@ -309,139 +273,6 @@ bool IsSkipOp(const framework::OperatorBase& op) {
  return false;
}
#ifdef PADDLE_WITH_ASCEND_CL
using NpuOpRunner = paddle::operators::NpuOpRunner;
constexpr int FLOAT_STATUS_SIZE = 8;
static phi::DenseTensor& npu_float_status() {
static phi::DenseTensor float_status;
return float_status;
}
void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
const framework::Scope& scope,
const platform::Place& place) {
if (!platform::is_npu_place(place)) return;
std::call_once(white_list_init_flag, InitWhiteListFormEnv);
if (IsSkipOp(op)) return;
auto* dev_ctx = reinterpret_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
auto& flag = npu_float_status();
flag.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
NpuOpRunner("NPUAllocFloatStatus", {}, {flag}).Run(stream);
phi::DenseTensor tmp;
tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
NpuOpRunner("NPUClearFloatStatus", {tmp}, {flag}).Run(stream);
}
void PrintNpuVarInfo(const std::string& op_type,
const std::string& var_name,
const framework::Variable* var,
const platform::Place& place) {
const phi::DenseTensor* tensor{nullptr};
if (var->IsType<phi::DenseTensor>()) {
tensor = &var->Get<phi::DenseTensor>();
} else if (var->IsType<phi::SelectedRows>()) {
tensor = &var->Get<phi::SelectedRows>().value();
} else {
VLOG(10) << var_name << " var_name need not to check";
return;
}
if ((framework::TransToProtoVarType(tensor->dtype()) !=
proto::VarType::FP32) &&
(framework::TransToProtoVarType(tensor->dtype()) !=
proto::VarType::FP16)) {
return;
}
if (tensor->memory_size() == 0) {
VLOG(10) << var_name << " var_name need not to check, size == 0";
return;
}
VLOG(10) << "begin check " << op_type << " var_name:" << var_name
<< ", place:" << tensor->place() << ", numel:" << tensor->numel();
phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(tensor->dims());
cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype());
framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
LOG(WARNING) << "print [" << var_name << "] tensor info:";
// use env strategy control in future, -1=print_all.
int print_num = 3;
if (framework::TransToProtoVarType(tensor->dtype()) == proto::VarType::FP32) {
const float* value = cpu_tensor.data<float>();
PrintNanInf(value, tensor->numel(), print_num, op_type, var_name, false);
} else if (framework::TransToProtoVarType(tensor->dtype()) ==
proto::VarType::FP16) {
const paddle::platform::float16* value =
cpu_tensor.data<paddle::platform::float16>();
PrintNanInf(value, tensor->numel(), print_num, op_type, var_name, false);
}
}
void PrintNPUOpValueInfo(const framework::OperatorBase& op,
const framework::Scope& scope,
const platform::Place& place) {
LOG(WARNING) << "There are `nan` or `inf` in operator (" << op.Type()
<< "), here we print some tensor value info of this op.";
for (auto& vname : op.InputVars()) {
auto* var = scope.FindVar(vname);
if (var == nullptr) continue;
PrintNpuVarInfo(op.Type(), vname, var, place);
}
for (auto& vname : op.OutputVars(true)) {
auto* var = scope.FindVar(vname);
if (var == nullptr) continue;
PrintNpuVarInfo(op.Type(), vname, var, place);
}
}
static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
const framework::Scope& scope,
const platform::Place& place) {
if (!platform::is_npu_place(place)) return;
auto* dev_ctx = reinterpret_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
auto& flag = npu_float_status();
phi::DenseTensor tmp;
tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
// NPUGetFloatStatus updates data on input in-place.
// tmp is only placeholder.
NpuOpRunner("NPUGetFloatStatus", {flag}, {tmp}).Run(stream);
phi::DenseTensor cpu_tensor;
auto cpu_place = platform::CPUPlace();
float* cpu_data = static_cast<float*>(
cpu_tensor.mutable_data<float>({FLOAT_STATUS_SIZE}, cpu_place));
framework::TensorCopySync(flag, cpu_place, &cpu_tensor);
float sum = 0.0;
for (int i = 0; i < FLOAT_STATUS_SIZE; ++i) {
sum += cpu_data[i];
}
if (sum >= 1.0) PrintNPUOpValueInfo(op, scope, place);
PADDLE_ENFORCE_LT(sum,
1.0,
platform::errors::PreconditionNotMet(
"Operator %s contains Nan/Inf.", op.Type()));
}
#endif
void CheckOpHasNanOrInf(const framework::OperatorBase& op,
                        const framework::Scope& exec_scope,
                        const platform::Place& place) {
@@ -449,13 +280,6 @@ void CheckOpHasNanOrInf(const framework::OperatorBase& op,
  if (IsSkipOp(op)) return;
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place)) {
NPUCheckOpHasNanOrInf(op, exec_scope, place);
return;
}
#endif
  if (op_var_nan_inf_white_list().count(op.Type()) == 0) {
    // NOTE. vname may destruct in the end of this func.
    for (auto& vname : op.OutputVars(true)) {
......
@@ -674,8 +674,7 @@ class PSGPUWorker : public HogwildWorker {
};
#endif
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class SectionWorker : public DeviceWorker {
 public:
  SectionWorker() {}
......
@@ -83,8 +83,7 @@ REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
#endif
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
#endif
}  // namespace framework
......
@@ -516,23 +516,6 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
    PADDLE_THROW(
        platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
#endif
} else if (platform::is_npu_place(place_)) {
#ifdef PADDLE_WITH_ASCEND_CL
if (IsFastEagerDeletionModeEnabled()) {
VLOG(4) << "Use unsafe fast gc for NPU.";
gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Please set FLAGS_fast_eager_deletion_mode=true to use "
"GarbageCollector on NPU."));
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
VLOG(4) << "Use default stream gc for NPU.";
gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size));
}
#else
PADDLE_THROW(
platform::errors::Unimplemented("No NPU gc found in CPU/NPU paddle"));
#endif
  } else if (platform::is_mlu_place(place_)) {
#ifdef PADDLE_WITH_MLU
......
@@ -124,10 +124,3 @@ cc_test(
  test_fleet_cc
  SRCS test_fleet.cc
  DEPS fleet_wrapper gloo_wrapper fs shell)
if(WITH_ASCEND OR WITH_ASCEND_CL)
cc_library(
ascend_wrapper
SRCS ascend_wrapper.cc
DEPS framework_proto lod_tensor ascend_ge ascend_graph)
endif()
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
namespace paddle {
namespace framework {
std::shared_ptr<AscendInstance> AscendInstance::ascend_instance_ = nullptr;
} // end namespace framework
} // end namespace paddle
#endif
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <glog/logging.h>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "ge/ge_api.h"
#include "graph/attr_value.h"
#include "graph/tensor.h"
#include "graph/types.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace framework {
typedef ge::Graph AscendGraphDesc;
#ifdef PADDLE_WITH_ASCEND_STRING
using AscendString = ge::AscendString;
#else
using AscendString = std::string;
#endif
class AscendInstance {
public:
virtual ~AscendInstance() {}
AscendInstance() {}
std::map<AscendString, AscendString> _GetDefaultInitOptions() {
std::map<AscendString, AscendString> init_options;
init_options["ge.exec.deviceId"] = "0";
init_options["ge.graphRunMode"] = "1";
return init_options;
}
std::map<AscendString, AscendString> _GetDefaultInitSessionOptions() {
std::map<AscendString, AscendString> init_options;
// init_options["a"] = "b";
// init_options["ge.trainFlag"] = "1";
return init_options;
}
ge::Status InitGEForUT() {
return ge::GEInitialize(_GetDefaultInitOptions());
}
void InitGlobalResouces() {
LOG(INFO) << "Begin ascend InitGlobalResouces";
session_.reset(new ge::Session(_GetDefaultInitSessionOptions()));
if (session_ == nullptr) {
PADDLE_THROW(platform::errors::Fatal("new session error: nullptr"));
}
LOG(INFO) << "End ascend InitGlobalResouces";
}
void DestroyGlobalResouces() {
LOG(INFO) << "Begin ascend DestroyGlobalResouces";
session_ = nullptr;
LOG(INFO) << "Begin ascend DestroyGlobalResouces";
}
static std::shared_ptr<AscendInstance> GetInstance() {
if (nullptr == ascend_instance_) {
ascend_instance_.reset(new paddle::framework::AscendInstance());
VLOG(1) << "Initialize AscendInstance Done";
}
return ascend_instance_;
}
void AddAscendSubgraph(int graph_idx, const AscendGraphDesc &graph) {
ge::Status status = session_->AddGraph(graph_idx, graph);
PADDLE_ENFORCE_EQ(status,
ge::SUCCESS,
paddle::platform::errors::PreconditionNotMet(
"Calling addGraph of graph engine failed, please "
"check Ascend Log."));
VLOG(1) << "AddAscendSubgraph " << graph_idx << " Done";
}
ge::DataType VarTypeToGeType(proto::VarType::Type type) {
if (type == proto::VarType::FP16) {
return ge::DataType::DT_FLOAT16;
} else if (type == proto::VarType::FP32) {
return ge::DataType::DT_FLOAT;
} else if (type == proto::VarType::FP64) {
return ge::DataType::DT_DOUBLE;
} else if (type == proto::VarType::INT32) {
return ge::DataType::DT_INT32;
} else if (type == proto::VarType::INT64) {
return ge::DataType::DT_INT64;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Not support %s as tensor type.", DataTypeToString(type)));
}
}
int GeTypeSize(proto::VarType::Type type) {
if (type == proto::VarType::FP16) {
return 2;
} else if (type == proto::VarType::FP32) {
return 4;
} else if (type == proto::VarType::FP64) {
return 8;
} else if (type == proto::VarType::INT32) {
return 4;
} else if (type == proto::VarType::INT64) {
return 8;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Not support %s as tensor type.", DataTypeToString(type)));
}
}
ge::Tensor ConvertToGeTensor(const phi::DenseTensor *tensor) {
auto numel = tensor->numel();
std::vector<int64_t> vec_dim;
auto dimen = arity(tensor->dims());
for (auto i = 0; i < dimen; ++i) {
vec_dim.push_back(tensor->dims()[i]);
}
// For Debug
// VLOG(1) << "input numel: " << numel << ", dimen is " << vec_dim.size() <<
// ", and shape is";
// for (const auto e : vec_dim) {
// VLOG(0) << e;
// }
ge::Shape shape(vec_dim);
ge::TensorDesc tensor_desc(
shape,
ge::Format::FORMAT_ND,
VarTypeToGeType(framework::TransToProtoVarType(tensor->dtype())));
tensor_desc.SetRealDimCnt(vec_dim.size());
const uint8_t *data = reinterpret_cast<const uint8_t *>(tensor->data());
std::vector<uint8_t> dst(
numel * GeTypeSize(framework::TransToProtoVarType(tensor->dtype())));
memcpy(dst.data(),
data,
GeTypeSize(framework::TransToProtoVarType(tensor->dtype())) * numel);
ge::Tensor ge_tensor(tensor_desc, dst);
return ge_tensor;
}
void RunAscendSubgraph(int graph_idx,
const std::vector<const phi::DenseTensor *> &inputs,
std::vector<phi::DenseTensor *> *outputs) {
VLOG(1) << "Ascend Graph[" << graph_idx << "] is about to run.";
// Convert paddle phi::DenseTensor to GE phi::DenseTensor
std::vector<ge::Tensor> ge_inputs;
for (const auto &e : inputs) {
ge_inputs.push_back(ConvertToGeTensor(e));
}
// Run Graph
std::vector<ge::Tensor> ge_outputs;
ge::Status status = session_->RunGraph(graph_idx, ge_inputs, ge_outputs);
PADDLE_ENFORCE_EQ(status,
ge::SUCCESS,
paddle::platform::errors::PreconditionNotMet(
"Calling RunGraph of graph engine failed, please "
"check Ascend Log."));
VLOG(1) << "Run Ascend Graph[" << graph_idx << "] Done";
// change tensor back, note all tensor's type computed in GE is uint8
for (size_t i = 0; i < ge_outputs.size(); ++i) {
const uint8_t *ret_data = ge_outputs[i].GetData();
size_t size = ge_outputs[i].GetSize();
VLOG(1) << "GE phi::DenseTensor size of the " << i << "th output var is "
<< size;
auto *dst = (*outputs)[i]->mutable_data<uint8_t>({(int64_t)size},
platform::CPUPlace());
memcpy(dst, ret_data, size);
// Following for debug:
// VLOG(0) << "output for " << i << " var: ";
// float *tmp = reinterpret_cast<float*>(dst);
// for (size_t j = 0; j < size / 4; ++j) {
// printf("%f ", tmp[j]);
// }
// printf("\n");
}
}
protected:
std::shared_ptr<ge::Session> session_;
private:
static std::shared_ptr<AscendInstance> ascend_instance_;
};
} // namespace framework
} // namespace paddle
#endif
@@ -125,32 +125,6 @@ void CUDAPinnedGarbageCollector::ClearCallback(
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
NPUDefaultStreamGarbageCollector::NPUDefaultStreamGarbageCollector(
const platform::NPUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void NPUDefaultStreamGarbageCollector::Wait() const {
static_cast<platform::NPUDeviceContext *>(this->dev_ctx_)
->WaitStreamCallback();
}
void NPUDefaultStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
static_cast<platform::NPUDeviceContext *>(this->dev_ctx_)
->AddStreamCallback(callback);
}
NPUUnsafeFastGarbageCollector::NPUUnsafeFastGarbageCollector(
const platform::NPUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void NPUUnsafeFastGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback();
}
#endif
#ifdef PADDLE_WITH_MLU
MLUDefaultStreamGarbageCollector::MLUDefaultStreamGarbageCollector(
    const platform::MLUPlace &place, size_t max_memory_size)
......
@@ -139,28 +139,6 @@ class CUDAPinnedGarbageCollector : public GarbageCollector {
};
#endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUDefaultStreamGarbageCollector : public GarbageCollector {
public:
NPUDefaultStreamGarbageCollector(const platform::NPUPlace &place,
size_t max_memory_size);
void Wait() const override;
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
class NPUUnsafeFastGarbageCollector : public GarbageCollector {
public:
NPUUnsafeFastGarbageCollector(const platform::NPUPlace &place,
size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
#endif
#ifdef PADDLE_WITH_MLU
class MLUDefaultStreamGarbageCollector : public GarbageCollector {
 public:
......
@@ -60,11 +60,6 @@ inline std::tuple<int, int> GetThreadPoolConfig(const phi::Place& place,
  if (platform::is_xpu_place(place)) {
#if defined(PADDLE_WITH_XPU)
    device_count = phi::backends::xpu::GetXPUDeviceCount();
#endif
}
if (platform::is_npu_place(place)) {
#if defined(PADDLE_WITH_ASCEND_CL)
device_count = platform::GetNPUDeviceCount();
#endif
  }
  if (platform::is_ipu_place(place)) {
......
@@ -631,16 +631,6 @@ void BuildOpFuncList(const platform::Place& place,
    VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope);
#ifdef PADDLE_WITH_ASCEND_CL
// NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
// values, but only through special `float_status` to checks whether
// the operation is overflow. More about `float_status`, see:
// https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
if (FLAGS_check_nan_inf) {
framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
}
#endif
    try {
      if (dynamic_cast<framework::OperatorWithKernel*>(op) == nullptr) {
        VLOG(4) << "HandleOperatorBase";
......
@@ -87,16 +87,6 @@ inline void SetDeviceId(const platform::Place& place) {
#else
    auto dev_id = place.device;
    platform::SetXPUDeviceId(dev_id);
#endif
} else if (platform::is_npu_place(place)) {
#ifndef PADDLE_WITH_ASCEND_CL
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with NPU support.",
place));
#else
auto dev_id = place.device;
platform::SetNPUDeviceId(dev_id);
#endif
  } else if (platform::is_custom_place(place)) {
#ifndef PADDLE_WITH_CUSTOM_DEVICE
@@ -218,11 +208,6 @@ void InterpreterCore::RunImpl() {
    async_work_queue_ = GetWorkQueue();
    ExecuteInstructionList(vec_instruction_);
  }
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place_)) {
platform::DeviceContextPool::Instance().Get(place_)->Wait();
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  if (platform::is_custom_place(place_)) {
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
@@ -893,18 +878,6 @@ void InterpreterCore::RunOperator(const Instruction& instr_node) {
                    : var_scope_.GetMutableScope();
  VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope);
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place)) {
// NOTE(wangxi): nan/inf cannot be detected on NPU by checking the
// variable values, but only through special `float_status` to checks
// whether the operation is overflow. More about `float_status`, see:
// https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
if (FLAGS_check_nan_inf) {
framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
}
}
#endif
  auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
  {
    // If it is OperatorBase, InferShape do nothing.
......
@@ -770,16 +770,6 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
    auto dev_id = place.device;
    platform::SetXPUDeviceId(dev_id);
#endif
} else if (platform::is_npu_place(place)) {
#ifndef PADDLE_WITH_ASCEND_CL
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with NPU support.",
place));
#else
auto dev_id = place.device;
platform::SetNPUDeviceId(dev_id);
#endif
  } else if (platform::is_mlu_place(place)) {
#ifndef PADDLE_WITH_MLU
@@ -1692,17 +1682,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  bool fallback_to_cpu = false;
  auto* dev_ctx = pool.Get(place);
#ifdef PADDLE_WITH_ASCEND_CL
// NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
// values, but only through special `float_status` to checks whether
// the operation is overflow. More about `float_status`, see:
// https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
if (FLAGS_check_nan_inf) {
framework::details::NPUAllocAndClearFloatStatus(*this, scope, place);
}
#endif
  // using cache
  if (kernel_type_.get()) {
    dev_ctx = pool.Get(kernel_type_->place_);
......
@@ -553,20 +553,6 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
      PADDLE_THROW(platform::errors::PermissionDenied(
          "Paddle can't use IPU device since it's not compiled with IPU,"
          "Please recompile or reinstall Paddle with IPU support."));
#endif
} else if (platform::is_npu_place(place)) {
#if defined(PADDLE_WITH_ASCEND_CL)
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new NPUUnsafeFastGarbageCollector(place, max_memory_size));
} else {
gc.reset(new NPUUnsafeFastGarbageCollector(place, max_memory_size));
}
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use NPU device since it's not compiled with "
"NPU,"
"Please recompile or reinstall Paddle with NPU support."));
#endif
    } else if (platform::is_custom_place(place)) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
......
@@ -112,15 +112,6 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key,
        phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
  }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_key.backend() == phi::Backend::NPU) {
VLOG(3) << "phi missing NPU kernel: " << op.Type()
<< ", expected_kernel_key:" << kernel_key
<< ", fallback to CPU one!";
return phi::KernelKey(
phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
}
#endif
#ifdef PADDLE_WITH_MLU
  if (kernel_key.backend() == phi::Backend::MLU) {
    VLOG(3) << "phi missing MLU kernel: " << op.Type()
......
@@ -12,8 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/trainer.h"
@@ -37,8 +36,6 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
  int place_id = section_config.place_id();
#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_RCCL)
  place_ = platform::CUDAPlace(place_id);
#elif (defined PADDLE_WITH_ASCEND_CL) // NOLINT
place_ = platform::NPUPlace(place_id);
#endif
  worker_ = DeviceWorkerFactory::CreateDeviceWorker(
      trainer_desc.device_worker_name());
......
@@ -9,8 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include <float.h>
#include "paddle/fluid/framework/device_worker.h"
@@ -235,18 +234,6 @@ void SectionWorker::TrainFiles() {
      gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size));
    }
  }
#elif defined(PADDLE_WITH_ASCEND_CL)
if (IsFastEagerDeletionModeEnabled()) {
VLOG(4) << "Use unsafe fast gc for NPU.";
gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Please set FLAGS_fast_eager_deletion_mode=true to use "
"GarbageCollector on NPU."));
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
VLOG(4) << "Use default stream gc for NPU.";
gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size));
}
#endif
  }  // max_memory_size >= 0
......
@@ -143,35 +143,6 @@ TEST(DenseTensor, MutableData) {
    EXPECT_EQ(p1, p2);
  }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
{
phi::DenseTensor src_tensor;
float* p1 = nullptr;
float* p2 = nullptr;
// initialization
p1 = src_tensor.mutable_data<float>(phi::make_ddim({1, 2, 3}),
platform::NPUPlace(0));
auto p1_holder = src_tensor.Holder();
EXPECT_NE(p1, nullptr);
// set src_tensor a new dim with large size
// momery is supposed to be re-allocated
p2 = src_tensor.mutable_data<float>(phi::make_ddim({3, 1024}),
platform::NPUPlace(0));
auto p2_holder = src_tensor.Holder();
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1_holder.get(), p2_holder.get());
// set src_tensor a new dim with same size
// momery block is supposed to be unchanged
p1 = src_tensor.mutable_data<float>(phi::make_ddim({2, 2, 3}),
platform::NPUPlace(0));
EXPECT_EQ(p1, p2);
// set src_tensor a new dim with smaller size
// momery block is supposed to be unchanged
p2 = src_tensor.mutable_data<float>(phi::make_ddim({2, 2}),
platform::NPUPlace(0));
EXPECT_EQ(p1, p2);
}
#endif
} }
TEST(DenseTensor, ShareDataWith) { TEST(DenseTensor, ShareDataWith) {
...@@ -207,16 +178,6 @@ TEST(DenseTensor, ShareDataWith) { ...@@ -207,16 +178,6 @@ TEST(DenseTensor, ShareDataWith) {
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>()); ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
{
phi::DenseTensor src_tensor;
phi::DenseTensor dst_tensor;
src_tensor.mutable_data<int>(phi::make_ddim({2, 3, 4}),
platform::NPUPlace(0));
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
#endif
} }
TEST(DenseTensor, Slice) { TEST(DenseTensor, Slice) {
...@@ -271,33 +232,6 @@ TEST(DenseTensor, Slice) { ...@@ -271,33 +232,6 @@ TEST(DenseTensor, Slice) {
EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
{
phi::DenseTensor src_tensor;
src_tensor.mutable_data<double>(phi::make_ddim({6, 9}),
platform::NPUPlace(0));
phi::DenseTensor slice_tensor = src_tensor.Slice(2, 6);
phi::DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2);
EXPECT_EQ(slice_dims[0], 4);
EXPECT_EQ(slice_dims[1], 9);
uintptr_t src_data_address =
reinterpret_cast<uintptr_t>(src_tensor.data<double>());
uintptr_t src_mutable_data_address =
reinterpret_cast<uintptr_t>(src_tensor.mutable_data<double>(
src_tensor.dims(), platform::NPUPlace(0)));
uintptr_t slice_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
uintptr_t slice_mutable_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<double>(
slice_tensor.dims(), platform::NPUPlace(0)));
EXPECT_EQ(src_data_address, src_mutable_data_address);
EXPECT_EQ(slice_data_address, slice_mutable_data_address);
EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
}
#endif
} }
TEST(DenseTensor, ReshapeToMatrix) { TEST(DenseTensor, ReshapeToMatrix) {
......
...@@ -125,112 +125,6 @@ void TensorCopyImpl(const TENSOR& src, ...@@ -125,112 +125,6 @@ void TensorCopyImpl(const TENSOR& src,
"Copy from %s to %s is not supported.", src_place, dst_place)); "Copy from %s to %s is not supported.", src_place, dst_place));
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
// TODO(zhiqiu): handle different condition like CUDA code below
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) {
// 1. cpu tensor -> npu pinned tensor
platform::NPUPinnedPlace npu_pinned_place;
phi::DenseTensor npu_pinned_tensor;
npu_pinned_tensor.Resize(src.dims());
auto npu_pinned_ptr =
npu_pinned_tensor.mutable_data(npu_pinned_place, src.dtype());
memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
// 2. async copy npu pinned tensor -> npu tensor
memory::Copy(
dst_place,
dst_ptr,
npu_pinned_place,
npu_pinned_ptr,
size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event
auto npu_pinned_allocator =
static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
phi::Allocation* allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
}
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
}
else if (platform::is_npu_pinned_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { /* npu_pinned->npu */
auto src_npu_pinned_place = src_place;
auto dst_npu_place = dst_place;
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(
platform::is_npu_place(ctx_place),
true,
platform::errors::PreconditionNotMet(
"Device context place mismatch. When copying phi::DenseTensor "
"data from NPU Pinned memory to NPU memory, current "
"device context place should be NPU."));
auto ctx_npu_place = ctx_place;
PADDLE_ENFORCE_EQ(dst_npu_place,
ctx_npu_place,
platform::errors::PreconditionNotMet(
"The target NPU device and current device context do "
"not match. The target NPU device number is %d, but "
"device context NPU number is %d.",
dst_npu_place.device,
ctx_npu_place.device));
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(
dst_npu_place, dst_ptr, src_npu_pinned_place, src_ptr, size, stream);
}
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_pinned_place(dst_place)) { /* npu->npu_pinned */
auto src_npu_place = src_place;
auto dst_npu_pinned_place = dst_place;
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(
platform::is_npu_place(ctx_place),
true,
platform::errors::PreconditionNotMet(
"Device context place mismatch. When copying phi::DenseTensor "
"data from NPU memory to NPU Pinned memory, current "
"device context place should be NPU."));
auto ctx_npu_place = ctx_place;
PADDLE_ENFORCE_EQ(src_place,
ctx_npu_place,
platform::errors::PreconditionNotMet(
"The source NPU device and current device context do "
"not match. The source NPU device number is %d, but "
"device context NPU number is %d.",
src_npu_place.device,
ctx_npu_place.device));
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(
dst_npu_pinned_place, dst_ptr, src_npu_place, src_ptr, size, stream);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) { platform::is_cuda_pinned_place(dst_place)) {
...@@ -539,29 +433,6 @@ void TensorCopySync(const phi::DenseTensor& src, ...@@ -539,29 +433,6 @@ void TensorCopySync(const phi::DenseTensor& src,
"Copy from %s to %s is not supported.", src_place, dst_place)); "Copy from %s to %s is not supported.", src_place, dst_place));
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { /* npu -> cpu*/
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { /* cpu -> npu*/
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { /* npu -> npu*/
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) { platform::is_cuda_pinned_place(dst_place)) {
...@@ -758,31 +629,6 @@ void TensorToStream(std::ostream& os, ...@@ -758,31 +629,6 @@ void TensorToStream(std::ostream& os,
#else #else
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU")); "MLUPlace is not supported when not compiled with MLU"));
#endif
} else if (platform::is_npu_place(tensor.place())) {
#ifdef PADDLE_WITH_ASCEND_CL
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& npu_dev_ctx =
static_cast<const platform::NPUDeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu,
buf.get(),
tensor.place(),
reinterpret_cast<const void*>(data),
size_to_write,
npu_dev_ctx.stream());
npu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
#endif #endif
} else if (platform::is_custom_place(tensor.place())) { } else if (platform::is_custom_place(tensor.place())) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
...@@ -875,7 +721,7 @@ void TensorFromStream(std::istream& is, ...@@ -875,7 +721,7 @@ void TensorFromStream(std::istream& is,
platform::is_custom_place(dev_ctx.GetPlace())) { platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE) defined(PADDLE_WITH_CUSTOM_DEVICE)
phi::DenseTensor cpu_tensor; phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(phi::make_ddim(shape)); cpu_tensor.Resize(phi::make_ddim(shape));
framework::VisitDataType( framework::VisitDataType(
...@@ -958,7 +804,7 @@ void TensorFromStream(std::istream& is, ...@@ -958,7 +804,7 @@ void TensorFromStream(std::istream& is,
platform::is_custom_place(dev_ctx.GetPlace())) { platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE) defined(PADDLE_WITH_CUSTOM_DEVICE)
phi::DenseTensor cpu_tensor; phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(phi::make_ddim(dims)); cpu_tensor.Resize(phi::make_ddim(dims));
framework::VisitDataType( framework::VisitDataType(
......
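For reference, the TensorToStream branch removed a few hunks above drains device memory into the output stream through a fixed-size staging buffer (64 MB in the original). A host-only sketch of that chunking loop, with plain memcpy standing in for memory::Copy plus the device-context Wait; all names here are illustrative, not Paddle APIs:

#include <algorithm>
#include <cstring>
#include <fstream>
#include <memory>
#include <vector>

int main() {
  std::vector<char> device_data(10 * 1024 * 1024, 'x');  // pretend device buffer
  std::ofstream os("tensor.bin", std::ios::binary);

  constexpr size_t kBufSize = 1024 * 1024;                // bounded host staging buffer
  std::unique_ptr<char[]> buf(new char[kBufSize]);

  const char* data = device_data.data();
  size_t size = device_data.size();
  while (size != 0) {
    size_t size_to_write = std::min(kBufSize, size);
    std::memcpy(buf.get(), data, size_to_write);  // device -> host chunk
    os.write(buf.get(), size_to_write);           // host chunk -> stream
    data += size_to_write;
    size -= size_to_write;
  }
  return 0;
}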
...@@ -25,9 +25,6 @@ limitations under the License. */ ...@@ -25,9 +25,6 @@ limitations under the License. */
#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h" #include "paddle/fluid/platform/device/mlu/device_context.h"
...@@ -145,37 +142,6 @@ void TensorFromArray(const T* src, ...@@ -145,37 +142,6 @@ void TensorFromArray(const T* src,
reinterpret_cast<const phi::GPUContext&>(ctx).stream()); reinterpret_cast<const phi::GPUContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(dst_place)) { // NOLINT
// 1. vector -> npu pinned tensor
platform::NPUPinnedPlace npu_pinned_place;
phi::DenseTensor npu_pinned_tensor;
npu_pinned_tensor.Resize(dst->dims());
auto npu_pinned_ptr =
npu_pinned_tensor.mutable_data(npu_pinned_place, dst->dtype());
memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
// 2. async copy npu pinned tensor -> npu tensor
memory::Copy(
dst_place,
dst_ptr,
npu_pinned_place,
npu_pinned_ptr,
size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event
auto npu_pinned_allocator =
static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
phi::Allocation* allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(dst_place)) { // NOLINT else if (platform::is_mlu_place(dst_place)) { // NOLINT
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
...@@ -227,42 +193,6 @@ void TensorFromVector(const std::vector<T>& src, ...@@ -227,42 +193,6 @@ void TensorFromVector(const std::vector<T>& src,
reinterpret_cast<const phi::GPUContext&>(ctx).stream()); reinterpret_cast<const phi::GPUContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
// NOTE(zhiqiu): Be careful that aclrtMemcpyAsync is different from
// cudaMemcpyAsync.
// cudaMemcpyAsync is actually "sync" between cpu <-> gpu.
// aclrtMemcpyAsync is really "async" between cpu <-> npu.
// Since vector is on cpu, I think this function should be a "sync" operation,
// so pass nullptr as stream to memory::Copy().
else if (platform::is_npu_place(dst_place)) { // NOLINT
// 1. vector -> npu pinned tensor
phi::DenseTensor npu_pinned_tensor(dst->dtype());
platform::NPUPinnedPlace npu_pinned_place;
auto npu_pinned_ptr =
npu_pinned_tensor.mutable_data<T>(dst->dims(), npu_pinned_place);
memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
// 2. async copy npu pinned tensor -> npu tensor
memory::Copy(
dst_place,
dst_ptr,
npu_pinned_place,
npu_pinned_ptr,
size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event
auto npu_pinned_allocator =
static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
phi::Allocation* allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
}
#endif
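The branches removed above all follow the same staging pattern: copy the host data into an NPU-pinned buffer, issue the pinned-to-device copy asynchronously on the device stream, and record an event on the pinned allocation so it is not reclaimed before the copy finishes. Below is a minimal, self-contained sketch of that lifetime rule in plain C++; the thread and promise merely stand in for the ACL stream and event, and none of the names are Paddle or ACL APIs.

#include <cstring>
#include <future>
#include <memory>
#include <thread>
#include <vector>

int main() {
  std::vector<float> host_src(1024, 1.0f);

  // "Pinned" staging buffer: it must outlive the asynchronous copy, which is
  // exactly what RecordEvent guarantees in the removed code.
  auto staging = std::make_shared<std::vector<float>>(host_src);

  std::vector<float> device_dst(host_src.size());  // stands in for NPU memory

  std::promise<void> copy_done;                    // stands in for the recorded event
  std::future<void> event = copy_done.get_future();

  // Stands in for the asynchronous copy issued on the device stream.
  std::thread stream([staging, &device_dst, &copy_done]() {
    std::memcpy(device_dst.data(), staging->data(),
                staging->size() * sizeof(float));
    copy_done.set_value();
  });

  event.wait();  // only after the "event" completes may the staging buffer go away
  stream.join();
  return 0;
}

The final wait plays the same role as RecordEvent in the removed branches: the staging buffer may only be released once the asynchronous copy has observably completed.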
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(dst_place)) { // NOLINT else if (platform::is_mlu_place(dst_place)) { // NOLINT
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
...@@ -324,37 +254,6 @@ inline void TensorFromVector(const std::vector<bool>& src, ...@@ -324,37 +254,6 @@ inline void TensorFromVector(const std::vector<bool>& src,
reinterpret_cast<const phi::GPUContext&>(ctx).stream()); reinterpret_cast<const phi::GPUContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(dst_place)) { // NOLINT
// 1. vector -> npu pinned tensor
platform::NPUPinnedPlace npu_pinned_place;
phi::DenseTensor npu_pinned_tensor;
npu_pinned_tensor.Resize(dst->dims());
auto npu_pinned_ptr =
npu_pinned_tensor.mutable_data(npu_pinned_place, dst->dtype());
memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
// 2. async copy npu pinned tensor -> npu tensor
memory::Copy(
dst_place,
dst_ptr,
npu_pinned_place,
npu_pinned_ptr,
size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event
auto npu_pinned_allocator =
static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
phi::Allocation* allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT else if (platform::is_custom_place(dst_place)) { // NOLINT
auto stream = auto stream =
...@@ -433,11 +332,6 @@ void TensorToVector(const phi::DenseTensor& src, ...@@ -433,11 +332,6 @@ void TensorToVector(const phi::DenseTensor& src,
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(
...@@ -491,11 +385,6 @@ inline void TensorToVector(const phi::DenseTensor& src, ...@@ -491,11 +385,6 @@ inline void TensorToVector(const phi::DenseTensor& src,
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
...@@ -566,11 +455,6 @@ inline T GetValue(const phi::DenseTensor* x) { ...@@ -566,11 +455,6 @@ inline T GetValue(const phi::DenseTensor* x) {
if (!platform::is_cpu_place(x->place())) { if (!platform::is_cpu_place(x->place())) {
phi::DenseTensor cpu_x; phi::DenseTensor cpu_x;
framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x); framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x);
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx = pool.Get(x->place());
dev_ctx->Wait();
#endif
value = cpu_x.data<T>()[0]; value = cpu_x.data<T>()[0];
} else { } else {
value = x->data<T>()[0]; value = x->data<T>()[0];
......
...@@ -299,32 +299,6 @@ TEST(TensorToVector, Tensor_bool) { ...@@ -299,32 +299,6 @@ TEST(TensorToVector, Tensor_bool) {
} }
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
{
std::vector<bool> src_vec = {
false,
true,
false,
true,
false,
true,
false,
true,
false,
};
phi::DenseTensor npu_tensor;
paddle::platform::NPUPlace place(0);
paddle::platform::NPUDeviceContext npu_ctx(place);
paddle::framework::TensorFromVector<bool>(src_vec, npu_ctx, &npu_tensor);
std::vector<bool> dst;
paddle::framework::TensorToVector<bool>(npu_tensor, npu_ctx, &dst);
for (int i = 0; i < 3 * 3; ++i) {
EXPECT_EQ(src_vec[i], dst[i]);
}
}
#endif
} }
TEST(TensorFromDLPack, Tensor) { TEST(TensorFromDLPack, Tensor) {
......
...@@ -302,8 +302,7 @@ class PSGPUTrainer : public TrainerBase { ...@@ -302,8 +302,7 @@ class PSGPUTrainer : public TrainerBase {
}; };
#endif #endif
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
defined(PADDLE_WITH_ASCEND_CL)
class PipelineTrainer : public TrainerBase { class PipelineTrainer : public TrainerBase {
public: public:
PipelineTrainer() {} PipelineTrainer() {}
......
...@@ -82,8 +82,7 @@ REGISTER_TRAINER_CLASS(HeterXpuTrainer); ...@@ -82,8 +82,7 @@ REGISTER_TRAINER_CLASS(HeterXpuTrainer);
(defined PADDLE_WITH_PSLIB) (defined PADDLE_WITH_PSLIB)
REGISTER_TRAINER_CLASS(PSGPUTrainer); REGISTER_TRAINER_CLASS(PSGPUTrainer);
#endif #endif
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
defined(PADDLE_WITH_ASCEND_CL)
REGISTER_TRAINER_CLASS(PipelineTrainer); REGISTER_TRAINER_CLASS(PipelineTrainer);
#endif #endif
} // namespace framework } // namespace framework
......
...@@ -65,28 +65,6 @@ using Attribute = paddle::variant<paddle::blank, ...@@ -65,28 +65,6 @@ using Attribute = paddle::variant<paddle::blank,
std::vector<paddle::experimental::Scalar>>; std::vector<paddle::experimental::Scalar>>;
using AttributeMap = std::unordered_map<std::string, Attribute>; using AttributeMap = std::unordered_map<std::string, Attribute>;
#ifdef PADDLE_WITH_ASCEND_CL
using NPUAttribute = paddle::variant<paddle::blank,
int,
float,
std::string,
std::vector<int>,
std::vector<float>,
std::vector<std::string>,
bool,
std::vector<bool>,
BlockDesc*,
int64_t,
std::vector<BlockDesc*>,
std::vector<int64_t>,
std::vector<double>,
VarDesc*,
std::vector<VarDesc*>,
std::vector<std::vector<int64_t>>>;
using NPUAttributeMap = std::unordered_map<std::string, NPUAttribute>;
#endif
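The NPUAttribute alias deleted above is a wider instantiation of the variant-based attribute map the framework already uses for Attribute. A small sketch of the idea with std::variant, using a few placeholder attribute types rather than Paddle's actual list:

#include <iostream>
#include <string>
#include <unordered_map>
#include <variant>
#include <vector>

using Attribute = std::variant<int, float, std::string, std::vector<int>, bool>;
using AttributeMap = std::unordered_map<std::string, Attribute>;

int main() {
  AttributeMap attrs;
  attrs["axis"] = 1;
  attrs["scale"] = 0.5f;
  attrs["shape"] = std::vector<int>{2, 3};

  // Typed access: std::get throws std::bad_variant_access on a type mismatch.
  std::cout << "axis = " << std::get<int>(attrs["axis"]) << "\n";
  std::cout << "holds float? " << std::holds_alternative<float>(attrs["scale"]) << "\n";
  return 0;
}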
using OpCreator = using OpCreator =
std::function<OperatorBase*(const std::string& /*type*/, std::function<OperatorBase*(const std::string& /*type*/,
const VariableNameMap& /*inputs*/, const VariableNameMap& /*inputs*/,
......
...@@ -39,11 +39,6 @@ ...@@ -39,11 +39,6 @@
#endif #endif
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#include <hccl/hccl.h>
#include <hccl/hccl_types.h>
#endif
#if defined(PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_XPU_BKCL)
#include "xpu/bkcl.h" #include "xpu/bkcl.h"
#endif #endif
...@@ -69,10 +64,6 @@ class Communicator; ...@@ -69,10 +64,6 @@ class Communicator;
class NCCLCommunicator; class NCCLCommunicator;
#endif #endif
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
class Communicator;
class HCCLCommunicator;
#endif
#if defined(PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_XPU_BKCL)
class BKCLCommunicator; class BKCLCommunicator;
...@@ -205,9 +196,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< ...@@ -205,9 +196,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
#endif #endif
operators::CudnnRNNCache, operators::CudnnRNNCache,
#endif #endif
#if defined(PADDLE_WITH_ASCEND_CL)
HcclRootInfo,
#endif
#if defined(PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_XPU_BKCL)
BKCLUniqueId, BKCLUniqueId,
platform::BKCLCommunicator, platform::BKCLCommunicator,
......
...@@ -36,49 +36,6 @@ namespace paddle { ...@@ -36,49 +36,6 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
#ifdef PADDLE_WITH_ASCEND_CL
void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) {
if (!argument->use_npu()) return;
auto &graph = argument->main_graph();
std::vector<std::string> repetitive_params;
if (graph.Has(framework::ir::kRepetitiveParamAttr))
repetitive_params = graph.Get<std::vector<std::string>>(
framework::ir::kRepetitiveParamAttr);
LOG(INFO) << "Sync params from CPU to NPU";
PADDLE_ENFORCE_EQ(argument->npu_device_id_valid(),
true,
platform::errors::PreconditionNotMet(
"The npu_device_id field should be valid"));
platform::Place place = platform::NPUPlace(argument->npu_device_id());
auto *scope = argument->scope_ptr();
std::vector<std::string> all_vars = scope->LocalVarNames();
for (auto &var_name : all_vars) {
auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var,
platform::errors::PreconditionNotMet("The var should not be nullptr"));
if (var->IsType<phi::DenseTensor>()) {
auto *t = var->GetMutable<phi::DenseTensor>();
platform::CPUPlace cpu_place;
phi::DenseTensor temp_tensor;
temp_tensor.Resize(t->dims());
temp_tensor.mutable_data<float>(cpu_place);
paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
t->clear();
paddle::framework::TensorCopySync(temp_tensor, place, t);
}
}
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
// The parameters are on the cpu, therefore, synchronization is not necessary. // The parameters are on the cpu, therefore, synchronization is not necessary.
...@@ -253,11 +210,6 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { ...@@ -253,11 +210,6 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
argument->scope_valid(), argument->scope_valid(),
true, true,
platform::errors::PreconditionNotMet("The scope field should be valid")); platform::errors::PreconditionNotMet("The scope field should be valid"));
#ifdef PADDLE_WITH_ASCEND_CL
if (argument->use_npu_valid()) {
CopyParamsToNpu(argument);
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (argument->use_gpu_valid()) { if (argument->use_gpu_valid()) {
CopyParamsToGpu(argument); CopyParamsToGpu(argument);
......
...@@ -35,10 +35,6 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { ...@@ -35,10 +35,6 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
std::string repr() const override; std::string repr() const override;
private: private:
#ifdef PADDLE_WITH_ASCEND_CL
void CopyParamsToNpu(Argument *argument);
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void CopyParamsToGpu(Argument *argument); void CopyParamsToGpu(Argument *argument);
#endif #endif
......
...@@ -195,21 +195,6 @@ void AnalysisConfig::SetXpuDeviceId(int device_id) { ...@@ -195,21 +195,6 @@ void AnalysisConfig::SetXpuDeviceId(int device_id) {
Update(); Update();
} }
void AnalysisConfig::EnableNpu(int device_id) {
#if defined(PADDLE_WITH_ASCEND_CL)
use_npu_ = true;
npu_device_id_ = device_id;
#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
use_custom_device_ = true;
custom_device_id_ = device_id;
custom_device_type_ = "npu";
#else
LOG(ERROR) << "Please compile with npu to EnableNpu()";
use_npu_ = false;
#endif
Update();
}
void AnalysisConfig::EnableCustomDevice(const std::string &device_type, void AnalysisConfig::EnableCustomDevice(const std::string &device_type,
int device_id, int device_id,
Precision precision_mode) { Precision precision_mode) {
...@@ -1023,20 +1008,6 @@ void AnalysisConfig::Update() { ...@@ -1023,20 +1008,6 @@ void AnalysisConfig::Update() {
"with XPU-runtime.")); "with XPU-runtime."));
#endif #endif
} }
if (use_npu_) {
#if defined(PADDLE_WITH_ASCEND_CL) || defined(LITE_SUBGRAPH_WITH_NPU)
PADDLE_ENFORCE_EQ(use_gpu_,
false,
platform::errors::Unavailable(
"Currently, NPU and GPU cannot be enabled in the "
"same analysis configuration."));
#else
PADDLE_THROW(platform::errors::Unavailable(
"You tried to use an NPU device, but Paddle was not compiled "
"with NPU-runtime."));
#endif
}
if (use_ipu_) { if (use_ipu_) {
#ifndef PADDLE_WITH_IPU #ifndef PADDLE_WITH_IPU
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
......
...@@ -376,14 +376,6 @@ void AnalysisPredictor::InitPlace() { ...@@ -376,14 +376,6 @@ void AnalysisPredictor::InitPlace() {
"with WITH_XPU.")); "with WITH_XPU."));
#endif // PADDLE_WITH_XPU #endif // PADDLE_WITH_XPU
} }
} else if (config_.use_npu()) {
#ifdef PADDLE_WITH_ASCEND_CL
place_ = paddle::platform::NPUPlace(config_.npu_device_id());
#else
PADDLE_THROW(platform::errors::Unavailable(
"You tried to use NPU forward propagation, but Paddle was not compiled "
"with WITH_ASCEND_CL."));
#endif
} else if (config_.NNAdapter().use_nnadapter) { } else if (config_.NNAdapter().use_nnadapter) {
if (config_.lite_engine_enabled()) { if (config_.lite_engine_enabled()) {
place_ = paddle::platform::CPUPlace(); place_ = paddle::platform::CPUPlace();
......
...@@ -278,23 +278,6 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -278,23 +278,6 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
#else #else
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
"Not compile with XPU, should not reach here.")); "Not compile with XPU, should not reach here."));
#endif
} else {
#ifdef PADDLE_WITH_ASCEND_CL
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto *dev_ctx =
static_cast<const platform::NPUDeviceContext *>(pool.Get(place_));
auto dst_npu_place = place_;
memory::Copy(dst_npu_place,
static_cast<void *>(input_ptr),
platform::CPUPlace(),
inputs[i].data.data(),
inputs[i].data.length(),
dev_ctx->stream());
#else
PADDLE_THROW(platform::errors::Unavailable(
"Not compile with NPU, should not reach here."));
#endif #endif
} }
......
...@@ -305,15 +305,6 @@ TEST(inference_api_native, image_classification_xpu) { ...@@ -305,15 +305,6 @@ TEST(inference_api_native, image_classification_xpu) {
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(inference_api_native, word2vec_npu) {
MainWord2Vec(paddle::PaddlePlace::kNPU);
}
// TEST(inference_api_native, image_classification_npu) {
// MainImageClassification(paddle::PaddlePlace::kNPU);
// }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(inference_api_native, word2vec_gpu) { TEST(inference_api_native, word2vec_gpu) {
MainWord2Vec(paddle::PaddlePlace::kGPU); MainWord2Vec(paddle::PaddlePlace::kGPU);
......
...@@ -244,25 +244,6 @@ void Tensor::CopyFromCpu(const T *data) { ...@@ -244,25 +244,6 @@ void Tensor::CopyFromCpu(const T *data) {
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with XPU place because paddle is not compiled " "Can not create tensor with XPU place because paddle is not compiled "
"with XPU.")); "with XPU."));
#endif
} else if (place_ == PlaceType::kNPU) {
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
paddle::platform::NPUPlace npu_place(device_);
auto *t_data = tensor->mutable_data<T>(npu_place);
auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
pool.Get(npu_place));
paddle::memory::Copy(npu_place,
static_cast<void *>(t_data),
paddle::platform::CPUPlace(),
data,
ele_size,
dev_ctx->stream());
#else
PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with NPU place because paddle is not compiled "
"with NPU."));
#endif #endif
} else { } else {
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
...@@ -468,25 +449,6 @@ void Tensor::CopyToCpuImpl(T *data, ...@@ -468,25 +449,6 @@ void Tensor::CopyToCpuImpl(T *data,
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with XPU place because paddle is not compiled " "Can not create tensor with XPU place because paddle is not compiled "
"with XPU.")); "with XPU."));
#endif
} else if (place_ == PlaceType::kNPU) {
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto npu_place = t_place;
auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
pool.Get(npu_place));
paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data),
npu_place,
t_data,
ele_num * sizeof(T),
dev_ctx->stream());
paddle::platform::NPUStreamSync(dev_ctx->stream());
#else
PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with NPU place because paddle is not compiled "
"with NPU."));
#endif #endif
} else { } else {
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
......
...@@ -146,10 +146,6 @@ TEST(Tensor, FillRandomDataAndCheck) { ...@@ -146,10 +146,6 @@ TEST(Tensor, FillRandomDataAndCheck) {
ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kGPU)); ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kGPU));
ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kGPU)); ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kGPU));
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kNPU));
ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kNPU));
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kXPU)); ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kXPU));
ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kXPU)); ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kXPU));
......
...@@ -363,12 +363,6 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -363,12 +363,6 @@ struct PD_INFER_DECL AnalysisConfig {
/// ///
void SetXpuDeviceId(int device_id = 0); void SetXpuDeviceId(int device_id = 0);
/// ///
/// \brief Turn on NPU.
///
/// \param device_id device_id the NPU card to use (default is 0).
///
void EnableNpu(int device_id = 0);
///
/// \brief Turn on CustomDevice. /// \brief Turn on CustomDevice.
/// ///
/// \param device_type device_type the custom device to use. /// \param device_type device_type the custom device to use.
......
...@@ -171,11 +171,6 @@ void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, ...@@ -171,11 +171,6 @@ void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config,
enable_multi_stream); enable_multi_stream);
} }
void PD_ConfigEnableNpu(__pd_keep PD_Config* pd_config, int32_t device_id) {
CHECK_AND_CONVERT_PD_CONFIG;
config->EnableNpu(device_id);
}
PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) { PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG; CHECK_AND_CONVERT_PD_CONFIG;
return config->use_xpu(); return config->use_xpu();
......
...@@ -214,14 +214,6 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu( ...@@ -214,14 +214,6 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu(
PD_Bool adaptive_seqlen, PD_Bool adaptive_seqlen,
PD_Bool enable_multi_stream); PD_Bool enable_multi_stream);
/// ///
/// \brief Turn on NPU.
///
/// \param[in] pd_config config
/// \param[in] device_id device_id the NPU card to use.
///
PADDLE_CAPI_EXPORT extern void PD_ConfigEnableNpu(
__pd_keep PD_Config* pd_config, int32_t device_id);
///
/// \brief A boolean state telling whether the XPU is turned on. /// \brief A boolean state telling whether the XPU is turned on.
/// ///
/// \param[in] pd_config config /// \param[in] pd_config config
......
...@@ -212,15 +212,6 @@ func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune boo ...@@ -212,15 +212,6 @@ func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune boo
cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen), cvtGoBoolToPD(enableMultiStream)) cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen), cvtGoBoolToPD(enableMultiStream))
} }
///
/// \brief Turn on NPU.
///
/// \param deviceId the NPU card to use.
///
func (config *Config) EnableNpu(deviceId int32) {
C.PD_ConfigEnableNpu(config.c, C.int32_t(deviceId))
}
/// ///
/// \brief A boolean state telling whether the GPU is turned on. /// \brief A boolean state telling whether the GPU is turned on.
/// ///
......
...@@ -50,11 +50,6 @@ if(UNIX AND NOT APPLE) ...@@ -50,11 +50,6 @@ if(UNIX AND NOT APPLE)
list(APPEND ALLOCATOR_DEPS rt) list(APPEND ALLOCATOR_DEPS rt)
endif() endif()
if(WITH_ASCEND_CL)
list(APPEND ALLOCATOR_SRCS npu_allocator.cc npu_pinned_allocator.cc)
list(APPEND ALLOCATOR_DEPS npu_info)
endif()
if(WITH_CUSTOM_DEVICE) if(WITH_CUSTOM_DEVICE)
list(APPEND ALLOCATOR_SRCS custom_allocator.cc) list(APPEND ALLOCATOR_SRCS custom_allocator.cc)
endif() endif()
......
...@@ -54,10 +54,6 @@ ...@@ -54,10 +54,6 @@
#include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_info.h" #include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif #endif
...@@ -198,12 +194,6 @@ class AllocatorFacadePrivate { ...@@ -198,12 +194,6 @@ class AllocatorFacadePrivate {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
}
InitNaiveBestFitNPUPinnedAllocator();
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
...@@ -254,12 +244,6 @@ class AllocatorFacadePrivate { ...@@ -254,12 +244,6 @@ class AllocatorFacadePrivate {
InitNaiveBestFitCUDAPinnedAllocator(); InitNaiveBestFitCUDAPinnedAllocator();
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
}
InitNaiveBestFitNPUPinnedAllocator();
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
...@@ -823,17 +807,6 @@ class AllocatorFacadePrivate { ...@@ -823,17 +807,6 @@ class AllocatorFacadePrivate {
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
void InitNaiveBestFitNPUPinnedAllocator() {
allocators_[platform::NPUPinnedPlace()] =
std::make_shared<paddle::memory::allocation::NPUPinnedAllocator>();
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) { void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p); allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
...@@ -915,12 +888,6 @@ class AllocatorFacadePrivate { ...@@ -915,12 +888,6 @@ class AllocatorFacadePrivate {
places.emplace_back(platform::XPUPlace(dev_id)); places.emplace_back(platform::XPUPlace(dev_id));
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
int device_count = platform::GetNPUDeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) {
places.emplace_back(platform::NPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
int device_count = platform::GetIPUDeviceCount(); int device_count = platform::GetIPUDeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) { for (int dev_id = 0; dev_id < device_count; ++dev_id) {
...@@ -1107,7 +1074,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, ...@@ -1107,7 +1074,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
} else { } else {
return m->GetAllocator(p, size)->Allocate(size); return m->GetAllocator(p, size)->Allocate(size);
} }
#elif defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) #elif defined(PADDLE_WITH_XPU)
return GetAllocator(place)->Allocate(size); return GetAllocator(place)->Allocate(size);
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
......
...@@ -16,9 +16,6 @@ ...@@ -16,9 +16,6 @@
#include <memory> #include <memory>
#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif #endif
...@@ -29,10 +26,6 @@ namespace paddle { ...@@ -29,10 +26,6 @@ namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
#ifdef PADDLE_WITH_ASCEND_CL
using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator;
#endif
// Allocator Facade is the interface exposed to other modules. // Allocator Facade is the interface exposed to other modules.
// All the configuration or dirty code under development should // All the configuration or dirty code under development should
// be hidden behind this facade. // be hidden behind this facade.
......
...@@ -19,8 +19,7 @@ limitations under the License. */ ...@@ -19,8 +19,7 @@ limitations under the License. */
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "glog/logging.h" #include "glog/logging.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
defined(PADDLE_WITH_MLU) || defined(PADDLE_WITH_ASCEND_CL)
#define USE_DEVICE #define USE_DEVICE
DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif #endif
...@@ -57,9 +56,6 @@ BuddyAllocator::BuddyAllocator( ...@@ -57,9 +56,6 @@ BuddyAllocator::BuddyAllocator(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
init_allocate_size_func_ = &platform::GpuInitAllocSize; init_allocate_size_func_ = &platform::GpuInitAllocSize;
re_allocate_size_func_ = &platform::GpuReallocSize; re_allocate_size_func_ = &platform::GpuReallocSize;
#elif defined(PADDLE_WITH_ASCEND_CL)
init_allocate_size_func_ = &platform::NPUInitAllocSize;
re_allocate_size_func_ = &platform::NPUReallocSize;
#elif defined(PADDLE_WITH_MLU) #elif defined(PADDLE_WITH_MLU)
init_allocate_size_func_ = &platform::MLUInitAllocSize; init_allocate_size_func_ = &platform::MLUInitAllocSize;
re_allocate_size_func_ = &platform::MLUReallocSize; re_allocate_size_func_ = &platform::MLUReallocSize;
...@@ -257,9 +253,6 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( ...@@ -257,9 +253,6 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
allocate_bytes = DeviceAllocateSize( allocate_bytes = DeviceAllocateSize(
&platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes); &platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes);
#elif defined(PADDLE_WITH_ASCEND_CL)
allocate_bytes = DeviceAllocateSize(
&platform::NPUInitAllocSize, &platform::NPUReallocSize, request_bytes);
#elif defined(PADDLE_WITH_MLU) #elif defined(PADDLE_WITH_MLU)
allocate_bytes = DeviceAllocateSize( allocate_bytes = DeviceAllocateSize(
&platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes); &platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes);
......
...@@ -29,8 +29,7 @@ limitations under the License. */ ...@@ -29,8 +29,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/mlu/mlu_info.h" #include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif #endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb);
...@@ -396,34 +395,6 @@ TEST(BuddyAllocator, Release) { ...@@ -396,34 +395,6 @@ TEST(BuddyAllocator, Release) {
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(BuddyAllocator, NpuFraction) {
// In a 16 GB machine, the pool size will be about 160 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.92;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new NPUAllocator(0)),
platform::NPUMinChunkSize(),
platform::NPUMaxChunkSize());
// Less than pool size
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
buddy_allocator.Release();
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator,
300 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator,
1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
TEST(BuddyAllocator, MluFraction) { TEST(BuddyAllocator, MluFraction) {
// In a 16 GB machine, the pool size will be about 160 MB // In a 16 GB machine, the pool size will be about 160 MB
......
...@@ -213,210 +213,6 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) { ...@@ -213,210 +213,6 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#endif #endif
} }
// For Ascend NPU
#ifdef PADDLE_WITH_ASCEND_CL
constexpr int EXTRA_PADDING_SIZE = 32;
class NPUBuddyAllocatorList {
private:
NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) {
auto npu_num = devices_.size();
allocators_.resize(npu_num);
init_flags_.reserve(npu_num);
for (size_t i = 0; i < npu_num; ++i) {
init_flags_.emplace_back(new std::once_flag());
}
}
static NPUBuddyAllocatorList *CreateNewInstance() {
return new NPUBuddyAllocatorList();
}
public:
static NPUBuddyAllocatorList *Instance() {
static auto *instance = CreateNewInstance();
return instance;
}
BuddyAllocator *Get(int npu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetNPUDeviceId(devices_[pos]);
allocators_[pos].reset(
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::NPUAllocator(devices_[pos])),
platform::NPUMinChunkSize(),
platform::NPUMaxChunkSize(),
EXTRA_PADDING_SIZE));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
<< "or 'FLAGS_initial_gpu_memory_in_mb' "
<< "or 'FLAGS_reallocate_gpu_memory_in_mb' "
<< "to change the memory size for GPU usage.\n"
<< "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
<< FLAGS_fraction_of_gpu_memory_to_use
<< ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
<< FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
});
return allocators_[pos].get();
}
private:
std::vector<int> devices_;
std::vector<std::unique_ptr<std::once_flag>> init_flags_;
std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
};
BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
return NPUBuddyAllocatorList::Instance()->Get(npu_id);
}
BuddyAllocator *GetNPUPinnedBuddyAllocator() {
static std::once_flag init_flag;
static BuddyAllocator *ba = nullptr;
std::call_once(init_flag, []() {
ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::NPUPinnedAllocator),
phi::backends::cpu::NPUPinnedMinChunkSize(),
phi::backends::cpu::NPUPinnedMaxChunkSize());
});
return ba;
}
#endif
template <>
size_t Used<platform::NPUPlace>(const platform::NPUPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUBuddyAllocator(place.device)->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
auto *buddy_allocator = GetNPUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
platform::NPUDeviceGuard(place.device);
size_t avail, total;
platform::NPUMemoryUsage(&avail, &total);
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize "
"%s, NpuMaxChunkSize %s, NPU memory used: %s.",
string::HumanReadableSize(size),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::NPUPlace>(place))));
} else {
if (FLAGS_init_allocated_mem) {
platform::NPUMemsetSync(ptr, 0xEF, size, size);
}
}
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
return ptr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
void Free<platform::NPUPlace>(const platform::NPUPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetNPUBuddyAllocator(place.device)->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUBuddyAllocator(place.device)->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
size_t Used<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUPinnedBuddyAllocator()->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}
template <>
void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
auto *buddy_allocator = GetNPUPinnedBuddyAllocator();
void *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
LOG(WARNING) << "Cannot allocate " << size << " bytes in NPUPinnedPlace";
}
if (FLAGS_init_allocated_mem) {
memset(ptr, 0xEF, size);
}
return ptr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}
template <>
void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
GetNPUPinnedBuddyAllocator()->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}
template <>
uint64_t Release<platform::NPUPinnedPlace>(
const platform::NPUPinnedPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUPinnedBuddyAllocator()->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}
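The NPUBuddyAllocatorList removed above is an instance of a common pattern: a process-wide singleton that lazily constructs one allocator per selected device, guarded by a std::once_flag per slot so concurrent first calls stay safe. A reduced, self-contained sketch of that pattern; the Allocator type and device ids below are placeholders, not Paddle's BuddyAllocator:

#include <cstdio>
#include <memory>
#include <mutex>
#include <utility>
#include <vector>

struct Allocator {
  explicit Allocator(int dev) { std::printf("init allocator for device %d\n", dev); }
};

class AllocatorList {
 public:
  static AllocatorList& Instance() {
    static AllocatorList instance({0, 1});  // selected device ids
    return instance;
  }

  Allocator* Get(size_t pos) {
    // Build the allocator for this slot exactly once, even under concurrent calls.
    std::call_once(*flags_[pos], [this, pos] {
      allocators_[pos] = std::make_unique<Allocator>(devices_[pos]);
    });
    return allocators_[pos].get();
  }

 private:
  explicit AllocatorList(std::vector<int> devices) : devices_(std::move(devices)) {
    allocators_.resize(devices_.size());
    for (size_t i = 0; i < devices_.size(); ++i) {
      flags_.emplace_back(new std::once_flag());
    }
  }

  std::vector<int> devices_;
  std::vector<std::unique_ptr<std::once_flag>> flags_;
  std::vector<std::unique_ptr<Allocator>> allocators_;
};

int main() {
  AllocatorList::Instance().Get(1);
  AllocatorList::Instance().Get(1);  // reuses the allocator built above
  return 0;
}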
// For CUDA // For CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class GPUBuddyAllocatorList { class GPUBuddyAllocatorList {
......
...@@ -61,22 +61,6 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) { ...@@ -61,22 +61,6 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) {
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(NaiveBestFitAllocatorTest, NpuAlloc) {
NaiveBestFitAllocator alloc{platform::NPUPlace(0)};
{
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
}
sleep(10);
alloc.Release(platform::NPUPlace(0));
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
alloc.Release(platform::NPUPlace(0));
}
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
TEST(NaiveBestFitAllocatorTest, MluAlloc) { TEST(NaiveBestFitAllocatorTest, MluAlloc) {
NaiveBestFitAllocator alloc{platform::MLUPlace(0)}; NaiveBestFitAllocator alloc{platform::MLUPlace(0)};
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/npu_allocator.h"
#include <string>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace memory {
namespace allocation {
bool NPUAllocator::IsAllocThreadSafe() const { return true; }
void NPUAllocator::FreeImpl(phi::Allocation* allocation) {
PADDLE_ENFORCE_EQ(
allocation->place(),
place_,
platform::errors::PermissionDenied(
"NPU memory is freed in incorrect device. This may be a bug"));
platform::RecordedNPUFree(
allocation->ptr(), allocation->size(), place_.device);
delete allocation;
}
phi::Allocation* NPUAllocator::AllocateImpl(size_t size) {
std::call_once(once_flag_,
[this] { platform::SetNPUDeviceId(place_.device); });
void* ptr;
auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device);
if (LIKELY(result == ACL_ERROR_NONE)) {
return new Allocation(ptr, size, platform::Place(place_));
}
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedNPUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, place_.device);
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger "
"value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum "
"GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on NPU %d. "
"Cannot allocate %s memory on NPU %d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using NPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
"2. If no, please decrease the batch size of your model. %s\n\n",
place_.device,
string::HumanReadableSize(size),
place_.device,
string::HumanReadableSize(avail),
place_.device,
err_msg));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class NPUAllocator : public Allocator {
public:
explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {}
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(phi::Allocation* allocation) override;
phi::Allocation* AllocateImpl(size_t size) override;
private:
platform::NPUPlace place_;
std::once_flag once_flag_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
void NPUPinnedAllocator::ProcessEventsAndFree() {
for (auto it = npu_events_.begin(); it != npu_events_.end();) {
aclrtEvent event = it->second;
aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
platform::NPUEventQuery(event, &status);
if (status == ACL_EVENT_STATUS_COMPLETE) {
auto *allocation = it->first;
void *ptr = allocation->ptr();
free(ptr);
npu_events_.erase(it++);
delete allocation;
platform::NPUEventDestroy(event);
} else {
++it;
}
}
}
phi::Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) {
std::lock_guard<std::mutex> lock(mtx_);
ProcessEventsAndFree();
void *ptr;
int error = posix_memalign(&ptr, kAlignment, size);
PADDLE_ENFORCE_EQ(
error,
0,
platform::errors::ResourceExhausted(
"Fail to alloc memory of %ld size, error code is %d.", size, error));
return new Allocation(ptr, size, platform::NPUPinnedPlace());
}
void NPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) {
std::lock_guard<std::mutex> lock(mtx_);
void *ptr = allocation->ptr();
auto iter = npu_events_.find(allocation);
// Managed by GC if RecordEvent was not called.
if (iter == npu_events_.end()) {
// Double free? No such problem has been found so far.
// Or maybe we need a set<Allocation*> to record which
// Allocations are managed by GC.
free(ptr);
delete allocation;
return;
}
aclrtEvent event = iter->second;
aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
platform::NPUEventQuery(event, &status);
if (status == ACL_EVENT_STATUS_COMPLETE) {
free(ptr);
npu_events_.erase(allocation);
delete allocation;
platform::NPUEventDestroy(event);
}
return;
}
uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) {
std::lock_guard<std::mutex> lock(mtx_);
// Empty implementation
return static_cast<uint64_t>(0);
}
void NPUPinnedAllocator::RecordEvent(phi::Allocation *allocation,
aclrtStream stream) {
std::lock_guard<std::mutex> lock(mtx_);
aclrtEvent event = nullptr;
platform::NPUEventCreate(&event);
platform::NPUEventRecord(event, stream);
npu_events_.insert({allocation, event});
}
} // namespace allocation
} // namespace memory
} // namespace paddle
#endif
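For orientation: the NPUPinnedAllocator removed above defers freeing pinned host memory until the aclrtEvent recorded after the buffer's last asynchronous use has completed, and reaps completed buffers on the next allocation via ProcessEventsAndFree(). Below is a minimal, self-contained C++ sketch of that deferred-free idea only; the class and member names are hypothetical, and a std::atomic<bool> stands in for the ACL event.

#include <atomic>
#include <cstdlib>
#include <memory>
#include <unordered_map>

// Toy deferred-free pool: buffers handed back while an "event" is still
// pending stay alive until ProcessEventsAndFree() observes completion.
class DeferredFreePool {
 public:
  // Called after enqueueing async work that reads the buffer; the "stream"
  // flips the flag to true when that work finishes.
  std::shared_ptr<std::atomic<bool>> RecordEvent(void* ptr) {
    auto flag = std::make_shared<std::atomic<bool>>(false);
    events_[ptr] = flag;
    return flag;
  }

  // Called instead of free(): release immediately only if no event was
  // recorded, or if the recorded event has already completed.
  void Free(void* ptr) {
    auto it = events_.find(ptr);
    if (it == events_.end()) {
      std::free(ptr);
      return;
    }
    if (it->second->load()) {
      std::free(ptr);
      events_.erase(it);
    }
    // Otherwise the buffer is reaped later by ProcessEventsAndFree().
  }

  // Called on every allocation to reap buffers whose events have completed
  // since the last call.
  void ProcessEventsAndFree() {
    for (auto it = events_.begin(); it != events_.end();) {
      if (it->second->load()) {
        std::free(it->first);
        it = events_.erase(it);
      } else {
        ++it;
      }
    }
  }

 private:
  std::unordered_map<void*, std::shared_ptr<std::atomic<bool>>> events_;
};

int main() {
  DeferredFreePool pool;
  void* buf = std::malloc(256);
  auto done = pool.RecordEvent(buf);  // async copy from buf is enqueued
  pool.Free(buf);                     // not released yet: event still pending
  done->store(true);                  // the stream signals completion
  pool.ProcessEventsAndFree();        // now the buffer is actually freed
  return 0;
}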
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <mutex> // NOLINT
#include <string>
#include <unordered_map>
#include "acl/acl.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class NPUPinnedAllocator : public Allocator {
public:
bool IsAllocThreadSafe() const override { return true; }
void ProcessEventsAndFree();
void RecordEvent(phi::Allocation *allocation, aclrtStream stream);
constexpr static size_t kAlignment = 4096UL;
protected:
phi::Allocation *AllocateImpl(size_t size) override;
void FreeImpl(phi::Allocation *allocation) override;
uint64_t ReleaseImpl(const platform::Place &place) override;
private:
std::unordered_map<phi::Allocation *, aclrtEvent> npu_events_;
mutable std::mutex mtx_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
#endif
...@@ -287,135 +287,6 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; } ...@@ -287,135 +287,6 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
void* NPUAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
void* p;
auto result = platform::RecordedNPUMalloc(&p, size, npu_id_);
if (result == ACL_ERROR_NONE) {
*index = 0;
npu_alloc_size_ += size;
return p;
} else {
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedNPUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, npu_id_);
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on NPU %d. "
"Cannot allocate %s memory on NPU %d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using NPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
"2. If no, please try one of the following suggestions:\n"
" 1) Decrease the batch size of your model.\n"
" 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
npu_id_,
string::HumanReadableSize(size),
npu_id_,
string::HumanReadableSize(avail),
npu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
}
}
void NPUAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(npu_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size,
npu_alloc_size_));
npu_alloc_size_ -= size;
platform::RecordedNPUFree(p, size, npu_id_);
}
bool NPUAllocator::UseGpu() const { return true; }
void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
size_t usable =
phi::backends::cpu::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
if (size > usable) {
LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
<< " MB pinned memory."
<< ", available " << usable / 1024.0 / 1024.0 << " MB";
return nullptr;
}
void* p;
// PINNED memory is visible to all NPU contexts.
auto result = platform::NPUHostMalloc(&p, size);
if (result == ACL_ERROR_NONE) {
*index = 1; // PINNED memory
npu_pinnd_alloc_size_ += size;
return p;
} else {
LOG(WARNING) << "NPUHostMalloc failed.";
return nullptr;
}
return nullptr;
}
void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) {
aclError err;
PADDLE_ENFORCE_EQ(index,
1,
platform::errors::InvalidArgument(
"The index should be 1, but got %d", index));
PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated npu pinned memory (%d)",
size,
npu_pinnd_alloc_size_));
npu_pinnd_alloc_size_ -= size;
err = platform::NPUHostFree(p);
if (err != ACL_ERROR_NONE) {
PADDLE_ENFORCE_EQ(
err,
0,
platform::errors::Fatal(
"NPUHostFree failed in NPUPinnedAllocator, error code is %d", err));
}
}
bool NPUPinnedAllocator::UseGpu() const { return false; }
#endif
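For orientation: the removed NPUAllocator and NPUPinnedAllocator above follow the legacy SystemAllocator contract, in which Alloc() reports through *index which pool a block came from (0 for device memory, 1 for pinned host memory) and Free() checks both the index and the accounted size. A minimal, self-contained sketch of that contract follows; the toy class name is hypothetical and std::malloc stands in for the device runtime.

#include <cassert>
#include <cstddef>
#include <cstdlib>

// Toy allocator following the same (index, size) contract as the removed
// NPUAllocator: *index tells Free() which pool a block came from, and the
// running total guards against freeing more than was allocated.
class ToySystemAllocator {
 public:
  void* Alloc(std::size_t* index, std::size_t size) {
    if (size == 0) return nullptr;
    *index = 0;  // 0 = device pool in the removed code (1 = pinned host pool)
    alloc_size_ += size;
    return std::malloc(size);  // stand-in for RecordedNPUMalloc
  }

  void Free(void* p, std::size_t size, std::size_t index) {
    assert(index == 0);           // mirrors PADDLE_ENFORCE_EQ(index, 0, ...)
    assert(alloc_size_ >= size);  // mirrors PADDLE_ENFORCE_GE(...)
    alloc_size_ -= size;
    std::free(p);  // stand-in for RecordedNPUFree
  }

 private:
  std::size_t alloc_size_ = 0;
};

int main() {
  ToySystemAllocator a;
  std::size_t index = 0;
  void* p = a.Alloc(&index, 1 << 20);
  a.Free(p, 1 << 20, index);
  return 0;
}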
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
void* MLUAllocator::Alloc(size_t* index, size_t size) { void* MLUAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr; if (size <= 0) return nullptr;
......
...@@ -68,32 +68,6 @@ class CUDAPinnedAllocator : public SystemAllocator { ...@@ -68,32 +68,6 @@ class CUDAPinnedAllocator : public SystemAllocator {
}; };
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUAllocator : public SystemAllocator {
public:
explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {}
virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t npu_alloc_size_ = 0;
int npu_id_;
};
class NPUPinnedAllocator : public SystemAllocator {
public:
virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t npu_pinnd_alloc_size_ = 0;
};
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
class MLUAllocator : public SystemAllocator { class MLUAllocator : public SystemAllocator {
public: public:
......
...@@ -83,14 +83,6 @@ TEST(GPUAllocator, AllocFailure) { ...@@ -83,14 +83,6 @@ TEST(GPUAllocator, AllocFailure) {
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(NPUAllocator, Alloc) {
paddle::memory::detail::NPUAllocator a(0);
TestAllocator(&a, 1 << 20);
TestAllocator(&a, 1);
}
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
TEST(MLUAllocator, Alloc) { TEST(MLUAllocator, Alloc) {
paddle::memory::detail::MLUAllocator a(0); paddle::memory::detail::MLUAllocator a(0);
......
...@@ -260,415 +260,6 @@ void Copy<phi::Place, phi::XPUPlace>(phi::Place dst_place, ...@@ -260,415 +260,6 @@ void Copy<phi::Place, phi::XPUPlace>(phi::Place dst_place,
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
template <>
void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(dst_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";
if (stream) {
platform::RecordEvent record_event(
"NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1);
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_HOST_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else {
// On NPU, an async operation after a sync operation is OK, while a sync
// operation after an async one is not, since the async operation may not
// be done yet. So we need to wait before the sync operation.
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event(
"NpuMemcpySync:CPU->NPU", platform::TracerEventType::UserDefined, 1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
}
}
template <>
void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::NPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(src_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";
if (stream) {
platform::RecordEvent record_event(
"NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1);
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_DEVICE_TO_HOST,
reinterpret_cast<aclrtStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
platform::RecordEvent record_event(
"NpuMemcpySync:NPU->CPU", platform::TracerEventType::UserDefined, 1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
}
}
template <>
void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
void* dst,
platform::NPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by stream(" << stream << ")";
if (dst_place == src_place) {
platform::SetNPUDeviceId(src_place.device);
if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_DEVICE_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
}
} else {
if (!platform::NPUCanAccessPeer(dst_place.device, src_place.device)) {
PADDLE_THROW(platform::errors::Unavailable(
"Peer access between NPU places is not allowed."));
}
if (stream) {
// TODO(zhiqiu): support peer access?
platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_DEVICE_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
}
}
}
template <>
void Copy<platform::CPUPlace, platform::NPUPinnedPlace>(
platform::CPUPlace dst_place,
void* dst,
platform::NPUPinnedPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
template <>
void Copy<platform::NPUPinnedPlace, platform::CPUPlace>(
platform::NPUPinnedPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>(
platform::NPUPinnedPlace dst_place,
void* dst,
platform::NPUPinnedPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
platform::NPUPinnedPlace dst_place,
void* dst,
platform::NPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(src_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";
if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_DEVICE_TO_HOST,
reinterpret_cast<aclrtStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
}
}
template <>
void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
platform::NPUPlace dst_place,
void* dst,
platform::NPUPinnedPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(dst_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";
if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_HOST_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else {
// On NPU, an async operation after a sync operation is OK, while a sync
// operation after an async one is not, since the async operation may not
// be done yet. So we need to wait before the sync operation.
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
}
}
// NOTE: only for CPUPlace, NPUPlace and NPUPinnedPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
aclrtStream stream) {
if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst, place_src;
return Copy(place_dst, dst, place_src, src, num);
} else if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::NPU) {
platform::NPUPlace place_dst(dst_place.GetDeviceId());
platform::CPUPlace place_src;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::NPU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::NPUPlace place_src(src_place.GetDeviceId());
platform::CPUPlace place_dst;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::NPU &&
dst_place.GetType() == phi::AllocationType::NPU) {
platform::NPUPlace place_src(src_place.GetDeviceId());
platform::NPUPlace place_dst(dst_place.GetDeviceId());
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::NPUPINNED) {
platform::CPUPlace place_src;
platform::NPUPinnedPlace place_dst;
return Copy(place_dst, dst, place_src, src, num);
} else if (src_place.GetType() == phi::AllocationType::NPUPINNED &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst;
platform::NPUPinnedPlace place_src;
return Copy(place_dst, dst, place_src, src, num);
} else if (src_place.GetType() == phi::AllocationType::NPUPINNED &&
dst_place.GetType() == phi::AllocationType::NPUPINNED) {
platform::NPUPinnedPlace place_dst;
platform::NPUPinnedPlace place_src;
return Copy(place_dst, dst, place_src, src, num);
} else if (src_place.GetType() == phi::AllocationType::NPUPINNED &&
dst_place.GetType() == phi::AllocationType::NPU) {
platform::NPUPinnedPlace place_src;
platform::NPUPlace place_dst(dst_place.GetDeviceId());
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::NPU &&
dst_place.GetType() == phi::AllocationType::NPUPINNED) {
platform::NPUPinnedPlace place_dst;
platform::NPUPlace place_src(src_place.GetDeviceId());
return Copy(place_dst, dst, place_src, src, num, stream);
#ifdef PADDLE_WITH_CUSTOM_DEVICE
} else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT
dst_place.GetType() == phi::AllocationType::CUSTOM) {
platform::CPUPlace place_src;
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CustomPlace place_src(src_place);
platform::CPUPlace place_dst;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == phi::AllocationType::CUSTOM) {
platform::CustomPlace place_src(src_place);
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
#endif
}
}
// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (CPUPlace).
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (CPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace).
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
void* dst,
phi::CPUPlace src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPlace)
template <>
void Copy<phi::NPUPlace, phi::Place>(phi::NPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()),
dst,
src_place,
src,
num,
stream);
}
// NOTE: only for (NPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace)
template <>
void Copy<phi::Place, phi::NPUPlace>(phi::Place dst_place,
void* dst,
phi::NPUPlace src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(dst_place,
dst,
phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src,
num,
stream);
}
// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPinnedPlace)
template <>
void Copy<phi::NPUPinnedPlace, phi::Place>(phi::NPUPinnedPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (NPUPinnedPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace)
template <>
void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place,
void* dst,
phi::NPUPinnedPlace src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
// NOTE: only for (CPUPlace) -> (NPUPinnedPlace)
template <>
void Copy<phi::NPUPinnedPlace, phi::Place>(phi::NPUPinnedPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr);
}
// NOTE: only for (NPUPinnedPlace) -> (CPUPlace)
template <>
void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place,
void* dst,
phi::NPUPinnedPlace src_place,
const void* src,
size_t num) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr);
}
#endif
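For orientation: several of the removed Copy<> specializations above call dev_ctx->Wait() before a synchronous memcpy because, as their comments note, a sync operation issued after pending async work may read incomplete data. A minimal, self-contained analogy of that rule follows; a std::thread stands in for work enqueued on the device stream, and no Paddle or ACL APIs are used.

#include <algorithm>
#include <cassert>
#include <cstring>
#include <thread>
#include <vector>

int main() {
  std::vector<int> device(1024, 0);  // stand-in for a device buffer
  std::vector<int> host(1024, 0);

  // "Async" work still in flight on the stream.
  std::thread pending([&] { std::fill(device.begin(), device.end(), 42); });

  // A synchronous copy issued right now could observe partially written data,
  // which is why the removed code called dev_ctx->Wait() first; join() plays
  // that role here.
  pending.join();                          // dev_ctx->Wait()
  std::memcpy(host.data(), device.data(),  // NPUMemcpySync(...)
              host.size() * sizeof(int));
  assert(host[0] == 42);
  return 0;
}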
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K
...@@ -1391,18 +982,6 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, ...@@ -1391,18 +982,6 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place,
std::memcpy(dst, src, num); std::memcpy(dst, src, num);
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT
dst_place.GetType() == phi::AllocationType::NPUPINNED) {
std::memcpy(dst, src, num);
} else if (src_place.GetType() == phi::AllocationType::NPUPINNED &&
dst_place.GetType() == phi::AllocationType::CPU) {
std::memcpy(dst, src, num);
} else if (src_place.GetType() == phi::AllocationType::NPUPINNED &&
dst_place.GetType() == phi::AllocationType::NPUPINNED) {
std::memcpy(dst, src, num);
}
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT
dst_place.GetType() == phi::AllocationType::CPU) { dst_place.GetType() == phi::AllocationType::CPU) {
...@@ -1488,8 +1067,7 @@ void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place, ...@@ -1488,8 +1067,7 @@ void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
} }
#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA) && \ #if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA) && \
!defined(PADDLE_WITH_ASCEND_CL) && !defined(PADDLE_WITH_HIP) && \ !defined(PADDLE_WITH_HIP)
!defined(PADDLE_WITH_MLU)
template <> template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place, void Copy<phi::Place, phi::Place>(phi::Place dst_place,
......
...@@ -21,8 +21,7 @@ ...@@ -21,8 +21,7 @@
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/phi/backends/device_memory_aligment.h" #include "paddle/phi/backends/device_memory_aligment.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/convert_utils.h"
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h"
......
...@@ -148,16 +148,4 @@ TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { ...@@ -148,16 +148,4 @@ TEST(copy_cross_scope_to_main_scope, CUDA_fp32) {
ctx.PartialInitWithAllocator(); ctx.PartialInitWithAllocator();
Compare2<float>(&scope, ctx, "copy_cross_scope"); Compare2<float>(&scope, ctx, "copy_cross_scope");
} }
#elif PADDLE_WITH_ASCEND_CL
TEST(copy_cross_scope, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare1<float>(&scope, ctx, "copy_cross_scope");
}
TEST(copy_cross_scope_to_main_scope, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare2<float>(&scope, ctx, "copy_cross_scope");
}
#endif #endif
...@@ -28,15 +28,9 @@ function(detection_library TARGET_NAME) ...@@ -28,15 +28,9 @@ function(detection_library TARGET_NAME)
PARENT_SCOPE) PARENT_SCOPE)
endfunction() endfunction()
if(WITH_ASCEND_CL) detection_library(box_coder_op SRCS box_coder_op.cc)
detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op_npu.cc) detection_library(density_prior_box_op SRCS density_prior_box_op.cc
detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu)
density_prior_box_op.cu density_prior_box_op_npu.cc)
else()
detection_library(box_coder_op SRCS box_coder_op.cc)
detection_library(density_prior_box_op SRCS density_prior_box_op.cc
density_prior_box_op.cu)
endif()
if(WITH_XPU) if(WITH_XPU)
detection_library(iou_similarity_op SRCS iou_similarity_op.cc detection_library(iou_similarity_op SRCS iou_similarity_op.cc
...@@ -49,11 +43,6 @@ elseif(WITH_MLU) ...@@ -49,11 +43,6 @@ elseif(WITH_MLU)
iou_similarity_op_mlu.cc) iou_similarity_op_mlu.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc)
detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc) detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc)
elseif(WITH_ASCEND_CL)
detection_library(iou_similarity_op SRCS iou_similarity_op.cc
iou_similarity_op_npu.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_npu.cc)
detection_library(yolo_box_op SRCS yolo_box_op.cc)
else() else()
detection_library(iou_similarity_op SRCS iou_similarity_op.cc detection_library(iou_similarity_op SRCS iou_similarity_op.cc
iou_similarity_op.cu) iou_similarity_op.cu)
......
...@@ -36,13 +36,6 @@ inline std::vector<int> get_expand_times( ...@@ -36,13 +36,6 @@ inline std::vector<int> get_expand_times(
*expand_tensor, platform::CPUPlace(), &cpu_expand_tensor); *expand_tensor, platform::CPUPlace(), &cpu_expand_tensor);
expand_data = cpu_expand_tensor.data<int>(); expand_data = cpu_expand_tensor.data<int>();
} }
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(expand_tensor->place())) {
paddle::framework::TensorCopySync(
*expand_tensor, platform::CPUPlace(), &cpu_expand_tensor);
expand_data = cpu_expand_tensor.data<int>();
}
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(expand_tensor->place())) { if (platform::is_xpu_place(expand_tensor->place())) {
paddle::framework::TensorCopySync( paddle::framework::TensorCopySync(
......
...@@ -37,13 +37,6 @@ inline std::vector<int> get_expand_shape( ...@@ -37,13 +37,6 @@ inline std::vector<int> get_expand_shape(
*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); *shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>(); shape_data = cpu_shape_tensor.data<int>();
} }
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(shape_tensor->place())) {
paddle::framework::TensorCopySync(
*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>();
}
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(shape_tensor->place())) { if (platform::is_xpu_place(shape_tensor->place())) {
paddle::framework::TensorCopySync( paddle::framework::TensorCopySync(
...@@ -75,13 +68,6 @@ inline std::vector<int> get_expand_shape( ...@@ -75,13 +68,6 @@ inline std::vector<int> get_expand_shape(
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
vec_epxand_shape.push_back(*temp.data<int32_t>()); vec_epxand_shape.push_back(*temp.data<int32_t>());
} }
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(tensor->place())) { // NOLINT
phi::DenseTensor temp;
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
vec_epxand_shape.push_back(*temp.data<int32_t>());
}
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(tensor->place())) { // NOLINT else if (platform::is_xpu_place(tensor->place())) { // NOLINT
phi::DenseTensor temp; phi::DenseTensor temp;
......
if(WITH_ASCEND_CL)
cc_library(
beam_search_npu
SRCS beam_search_npu.cc
DEPS npu_op_runner)
endif()
if(WITH_XPU) if(WITH_XPU)
cc_library( cc_library(
beam_search_xpu beam_search_xpu
...@@ -13,9 +6,7 @@ if(WITH_XPU) ...@@ -13,9 +6,7 @@ if(WITH_XPU)
endif() endif()
# please add new math_library in alphabetical order # please add new math_library in alphabetical order
if(WITH_ASCEND_CL) if(WITH_MLU)
math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner)
elseif(WITH_MLU)
math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop) math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop)
else() else()
math_library(concat_and_split DEPS concat_and_split_functor) math_library(concat_and_split DEPS concat_and_split_functor)
......
...@@ -122,34 +122,6 @@ REGISTER_OPERATOR( ...@@ -122,34 +122,6 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>, paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
MemcpyD2HInferShapeFunctor); MemcpyD2HInferShapeFunctor);
#ifdef PADDLE_WITH_ASCEND_CL
REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_d2h,
float,
ops::MemcpyD2HKernel,
double,
ops::MemcpyD2HKernel,
int8_t,
ops::MemcpyD2HKernel,
uint8_t,
ops::MemcpyD2HKernel,
int,
ops::MemcpyD2HKernel,
int64_t,
ops::MemcpyD2HKernel,
bool,
ops::MemcpyD2HKernel,
paddle::platform::bfloat16,
ops::MemcpyD2HKernel,
paddle::platform::complex<float>,
ops::MemcpyD2HKernel,
paddle::platform::complex<double>,
ops::MemcpyD2HKernel,
plat::float16,
ops::MemcpyD2HKernel,
int16_t,
ops::MemcpyD2HKernel);
#endif
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_d2h, REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_d2h,
float, float,
......
...@@ -87,11 +87,7 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -87,11 +87,7 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker<T> {
op->SetAttrMap(this->Attrs()); op->SetAttrMap(this->Attrs());
op->SetInput("X", this->Input("X")); op->SetInput("X", this->Input("X"));
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
#ifndef PADDLE_WITH_ASCEND_CL
op->SetInput("Norm", this->Output("Norm")); op->SetInput("Norm", this->Output("Norm"));
#else
op->SetInput("Out", this->Output("Out"));
#endif
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
} }
}; };
......
...@@ -25,9 +25,6 @@ limitations under the License. */ ...@@ -25,9 +25,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/enforce.h" #include "paddle/fluid/platform/device/mlu/enforce.h"
#include "paddle/fluid/platform/device/mlu/mlu_info.h" #include "paddle/fluid/platform/device/mlu/mlu_info.h"
......
...@@ -248,31 +248,6 @@ void EmplaceDeviceContexts( ...@@ -248,31 +248,6 @@ void EmplaceDeviceContexts(
PADDLE_THROW( PADDLE_THROW(
platform::errors::Unimplemented("IPUPlace is not supported. Please " platform::errors::Unimplemented("IPUPlace is not supported. Please "
"re-compile with WITH_IPU option.")); "re-compile with WITH_IPU option."));
#endif
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
EmplaceDeviceContext<NPUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported. Please "
"re-compile with WITH_ASCEND_CL option."));
#endif
} else if (platform::is_npu_pinned_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
EmplaceDeviceContext<NPUPinnedDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPinnedPlace is not supported. Please re-compile with "
"WITH_ASCEND_CL "
"option."));
#endif #endif
} }
} }
......
...@@ -68,8 +68,6 @@ limitations under the License. */ ...@@ -68,8 +68,6 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#include "paddle/phi/backends/device_ext.h" #include "paddle/phi/backends/device_ext.h"
#include "paddle/phi/backends/stream.h" #include "paddle/phi/backends/stream.h"
...@@ -89,10 +87,6 @@ struct GpuDevice; ...@@ -89,10 +87,6 @@ struct GpuDevice;
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "acl/acl.h"
#endif
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -150,86 +144,6 @@ namespace xpu = baidu::xpu::api; ...@@ -150,86 +144,6 @@ namespace xpu = baidu::xpu::api;
using XPUDeviceContext = phi::XPUContext; using XPUDeviceContext = phi::XPUContext;
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUDeviceContext
: public DeviceContext,
public phi::TypeInfoTraits<DeviceContext, NPUDeviceContext> {
public:
explicit NPUDeviceContext(NPUPlace place);
virtual ~NPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
const Place& GetPlace() const override;
aclrtContext context() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
/*! \brief Return npu stream in the device context. */
aclrtStream stream() const;
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
return stream_->AddCallback(callback);
}
void WaitStreamCallback() const { return stream_->WaitCallback(); }
#if defined(PADDLE_WITH_ASCEND_CL)
/*! \brief Return hccl communicators. */
HcclComm hccl_comm() const { return hccl_comm_; }
/*! \brief Set hccl communicators. */
void set_hccl_comm(HcclComm comm) { hccl_comm_ = comm; }
#endif
// template <typename Callback>
// void AddStreamCallback(Callback&& callback) const {
// return stream_->AddCallback(callback);
// }
// void WaitStreamCallback() const { return stream_->WaitCallback(); }
static const char* name() { return "NPUDeviceContext"; }
private:
NPUPlace place_;
aclrtContext context_;
#ifdef PADDLE_WITH_ASCEND_CL
// HCCLContext_t hccl_context_;
HcclComm hccl_comm_{nullptr};
#endif
// Needs to be the same as other DeviceContexts,
// even though eigen_device_ is not used on NPU.
// NOTE(zhiqiu): why need?
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
std::shared_ptr<stream::NPUStream> stream_;
DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
};
// Currently, NPUPinnedDeviceContext is only used for data copying.
class NPUPinnedDeviceContext
: public DeviceContext,
public phi::TypeInfoTraits<DeviceContext, NPUPinnedDeviceContext> {
public:
NPUPinnedDeviceContext();
explicit NPUPinnedDeviceContext(NPUPinnedPlace place);
const Place& GetPlace() const override;
Eigen::DefaultDevice* eigen_device() const;
static const char* name() { return "NPUPinnedDeviceContext"; }
private:
NPUPinnedPlace place_;
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
};
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
using CUDAPinnedDeviceContext = phi::GPUPinnedContext; using CUDAPinnedDeviceContext = phi::GPUPinnedContext;
#endif #endif
...@@ -264,18 +178,6 @@ template <> ...@@ -264,18 +178,6 @@ template <>
struct DefaultDeviceContextType<phi::MLUPlace>; struct DefaultDeviceContextType<phi::MLUPlace>;
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
template <>
struct DefaultDeviceContextType<phi::NPUPlace> {
using TYPE = paddle::platform::NPUDeviceContext;
};
template <>
struct DefaultDeviceContextType<phi::NPUPinnedPlace> {
using TYPE = paddle::platform::NPUPinnedDeviceContext;
};
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <> template <>
struct DefaultDeviceContextType<phi::GPUPinnedPlace> { struct DefaultDeviceContextType<phi::GPUPinnedPlace> {
......
...@@ -38,12 +38,6 @@ USE_EVENT_WAIT(kCUDA, kCUDA) ...@@ -38,12 +38,6 @@ USE_EVENT_WAIT(kCUDA, kCUDA)
USE_EVENT_WAIT(kCPU, kCUDA) USE_EVENT_WAIT(kCPU, kCUDA)
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
USE_EVENT(kNPU);
USE_EVENT_WAIT(kNPU, kNPU)
USE_EVENT_WAIT(kCPU, kNPU)
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
USE_EVENT(kCUSTOM_DEVICE); USE_EVENT(kCUSTOM_DEVICE);
USE_EVENT_WAIT(kCUSTOM_DEVICE, kCUSTOM_DEVICE) USE_EVENT_WAIT(kCUSTOM_DEVICE, kCUSTOM_DEVICE)
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device_event_base.h"
#include "paddle/fluid/platform/event.h"
namespace paddle {
namespace platform {
struct NPUDeviceEventWrapper {
explicit NPUDeviceEventWrapper(const platform::Place& place) {
PADDLE_ENFORCE_EQ(
platform::is_npu_place(place),
true,
platform::errors::PreconditionNotMet(
"Required device shall be NPUPlace, but received %d. ", place));
device_id_ = place.device;
PADDLE_ENFORCE_GT(
device_id_,
-1,
platform::errors::PreconditionNotMet(
"Required DeviceOption.device_id > -1, but received %d. ",
device_id_));
inner_event_ = NpuEventResourcePool::Instance().New(device_id_);
}
std::shared_ptr<NpuEventObject> inner_event_;
int device_id_;
};
void DeviceEventCreateNPU(DeviceEvent* event,
const platform::Place& place,
unsigned int) {
event->InitEvent(std::make_shared<NPUDeviceEventWrapper>(place));
}
void DeviceEventRecordNPU(DeviceEvent* event, const DeviceContext* context) {
auto* wrapper = static_cast<NPUDeviceEventWrapper*>(event->GetEvent().get());
auto* npu_dev_ctx = dynamic_cast<const platform::NPUDeviceContext*>(context);
PADDLE_ENFORCE_NOT_NULL(
npu_dev_ctx,
platform::errors::PreconditionNotMet(
"Failed to dynamic_cast context into NPUDeviceContext."));
NPUEventRecord(wrapper->inner_event_.get(), npu_dev_ctx->stream());
}
bool DeviceEventQueryNPU(const DeviceEvent* event) {
auto* wrapper = static_cast<NPUDeviceEventWrapper*>(event->GetEvent().get());
PADDLE_ENFORCE_NOT_NULL(
wrapper,
platform::errors::PreconditionNotMet(
"Failed to dynamic_cast event into NPUDeviceEventWrapper."));
aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
platform::NPUEventQuery(wrapper->inner_event_.get(), &status);
return ACL_EVENT_STATUS_COMPLETE == status;
}
void DeviceEventFinishNPU(const DeviceEvent* event) {
auto* wrapper = static_cast<NPUDeviceEventWrapper*>(event->GetEvent().get());
NPUEventSynchronize(wrapper->inner_event_.get());
}
void DeviceEventNPUWaitNPU(const DeviceEvent* event,
const DeviceContext* context) {
auto* wrapper = static_cast<NPUDeviceEventWrapper*>(event->GetEvent().get());
auto* npu_dev_ctx = dynamic_cast<const platform::NPUDeviceContext*>(context);
PADDLE_ENFORCE_NOT_NULL(
npu_dev_ctx,
platform::errors::PreconditionNotMet(
"Failed to dynamic_cast context into NPUDeviceContext."));
NPUStreamWaitEvent(npu_dev_ctx->stream(), wrapper->inner_event_.get());
}
void DeviceEventCPUWaitNPU(const DeviceEvent* event,
const DeviceContext* context) {
DeviceEventFinishNPU(event);
}
void DeviceEventSetFinishedNPU(const DeviceEvent* event) {
// do nothing
}
void EventResetNPU(const DeviceEvent* event) {
// do nothing
}
} // namespace platform
} // namespace paddle
using ::paddle::platform::kCPU;
using ::paddle::platform::kNPU;
REGISTER_EVENT_CREATE_FUNCTION(kNPU, paddle::platform::DeviceEventCreateNPU)
REGISTER_EVENT_RECORD_FUNCTION(kNPU, paddle::platform::DeviceEventRecordNPU)
REGISTER_EVENT_QUERY_FUNCTION(kNPU, paddle::platform::DeviceEventQueryNPU)
REGISTER_EVENT_FINISH_FUNCTION(kNPU, paddle::platform::DeviceEventFinishNPU)
REGISTER_EVENT_SET_FINISHED_FUNCTION(
kNPU, paddle::platform::DeviceEventSetFinishedNPU)
REGISTER_EVENT_WAIT_FUNCTION(kNPU,
kNPU,
paddle::platform::DeviceEventNPUWaitNPU)
REGISTER_EVENT_WAIT_FUNCTION(kCPU,
kNPU,
paddle::platform::DeviceEventCPUWaitNPU)
REGISTER_EVENT_RESET_FUNCTION(kNPU, paddle::platform::EventResetNPU)
#endif
...@@ -54,7 +54,6 @@ void* GetCUDADsoHandle() { return phi::dynload::GetCUDADsoHandle(); } ...@@ -54,7 +54,6 @@ void* GetCUDADsoHandle() { return phi::dynload::GetCUDADsoHandle(); }
void* GetWarpCTCDsoHandle() { return phi::dynload::GetWarpCTCDsoHandle(); } void* GetWarpCTCDsoHandle() { return phi::dynload::GetWarpCTCDsoHandle(); }
void* GetNCCLDsoHandle() { return phi::dynload::GetNCCLDsoHandle(); } void* GetNCCLDsoHandle() { return phi::dynload::GetNCCLDsoHandle(); }
void* GetHCCLDsoHandle() { return phi::dynload::GetHCCLDsoHandle(); }
void* GetTensorRtDsoHandle() { return phi::dynload::GetTensorRtDsoHandle(); } void* GetTensorRtDsoHandle() { return phi::dynload::GetTensorRtDsoHandle(); }
......
...@@ -37,7 +37,6 @@ void* GetNVRTCDsoHandle(); ...@@ -37,7 +37,6 @@ void* GetNVRTCDsoHandle();
void* GetCUDADsoHandle(); void* GetCUDADsoHandle();
void* GetWarpCTCDsoHandle(); void* GetWarpCTCDsoHandle();
void* GetNCCLDsoHandle(); void* GetNCCLDsoHandle();
void* GetHCCLDsoHandle();
void* GetTensorRtDsoHandle(); void* GetTensorRtDsoHandle();
void* GetMKLMLDsoHandle(); void* GetMKLMLDsoHandle();
void* GetLAPACKDsoHandle(); void* GetLAPACKDsoHandle();
......
...@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL)
defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/gen_comm_id_helper.h"
#include <arpa/inet.h> #include <arpa/inet.h>
......
...@@ -14,9 +14,8 @@ limitations under the License. */ ...@@ -14,9 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL)
defined(PADDLE_WITH_CNCL)
#include <functional> #include <functional>
#include <memory> #include <memory>
#include <mutex> #include <mutex>
......
...@@ -187,17 +187,6 @@ void InitDevices() { ...@@ -187,17 +187,6 @@ void InitDevices() {
LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime."; LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime.";
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
// NOTE(zhiqiu): use singleton to explicitly init and finalize ACL
platform::AclInstance::Instance(); // NOLINT
try {
// use user specified NPUs in single-node multi-process mode.
devices = platform::GetSelectedNPUDevices();
} catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with PADDLE_WITH_ASCEND_CL, but no NPU found "
"in runtime.";
}
#endif
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
try { try {
// use user specified IPUs. // use user specified IPUs.
......
This diff has been collapsed.
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
void BindAscendGraph(py::module* m);
void BindAscendWrapper(py::module* m);
void BindAscendDevice(py::module* m);
} // namespace pybind
} // namespace paddle
#endif
...@@ -2616,19 +2616,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -2616,19 +2616,6 @@ void BindImperative(py::module *m_ptr) {
py::arg("ring_id")); py::arg("ring_id"));
#endif #endif
#if defined(PADDLE_WITH_ASCEND_CL)
py::class_<imperative::HCCLParallelContext,
imperative::ParallelContext,
std::shared_ptr<imperative::HCCLParallelContext>>(
m, "HCCLParallelContext")
.def(py::init<const imperative::ParallelStrategy &,
const platform::NPUPlace &>())
.def("init", [](imperative::HCCLParallelContext &self) { self.Init(); })
.def("init_with_ring_id",
&imperative::HCCLParallelContext::InitWithRingID,
py::arg("ring_id"));
#endif
#if defined(PADDLE_WITH_CNCL) #if defined(PADDLE_WITH_CNCL)
py::class_<imperative::CNCLParallelContext, py::class_<imperative::CNCLParallelContext,
imperative::ParallelContext, imperative::ParallelContext,
......
...@@ -772,7 +772,6 @@ void BindAnalysisConfig(py::module *m) { ...@@ -772,7 +772,6 @@ void BindAnalysisConfig(py::module *m) {
py::arg("device_type"), py::arg("device_type"),
py::arg("device_id") = 0, py::arg("device_id") = 0,
py::arg("precision") = AnalysisConfig::Precision::kFloat32) py::arg("precision") = AnalysisConfig::Precision::kFloat32)
.def("enable_npu", &AnalysisConfig::EnableNpu, py::arg("device_id") = 0)
.def("enable_ipu", .def("enable_ipu",
&AnalysisConfig::EnableIpu, &AnalysisConfig::EnableIpu,
py::arg("ipu_device_num") = 1, py::arg("ipu_device_num") = 1,
...@@ -1063,13 +1062,7 @@ void BindPaddleInferPredictor(py::module *m) { ...@@ -1063,13 +1062,7 @@ void BindPaddleInferPredictor(py::module *m) {
.def("get_output_names", &paddle_infer::Predictor::GetOutputNames) .def("get_output_names", &paddle_infer::Predictor::GetOutputNames)
.def("get_input_handle", &paddle_infer::Predictor::GetInputHandle) .def("get_input_handle", &paddle_infer::Predictor::GetInputHandle)
.def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle) .def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle)
.def("run", .def("run", [](paddle_infer::Predictor &self) { self.Run(); })
[](paddle_infer::Predictor &self) {
#ifdef PADDLE_WITH_ASCEND_CL
pybind11::gil_scoped_release release;
#endif
self.Run();
})
.def("clone", .def("clone",
[](paddle_infer::Predictor &self) { return self.Clone(nullptr); }) [](paddle_infer::Predictor &self) { return self.Clone(nullptr); })
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
...@@ -139,10 +139,6 @@ limitations under the License. */ ...@@ -139,10 +139,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
......
...@@ -139,10 +139,6 @@ limitations under the License. */ ...@@ -139,10 +139,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
...@@ -553,57 +549,14 @@ void BindPlace(pybind11::module &m) { // NOLINT ...@@ -553,57 +549,14 @@ void BindPlace(pybind11::module &m) { // NOLINT
py::class_<platform::NPUPlace> npuplace(m, "NPUPlace", R"DOC( py::class_<platform::NPUPlace> npuplace(m, "NPUPlace", R"DOC(
NPUPlace is a descriptor of a device. NPUPlace is a descriptor of a device.
It represents a NPU device on which a tensor will be allocated and a model will run. It represents a NPU device on which a tensor will be allocated and a model will run.
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: npu # required: npu
import paddle import paddle
place = paddle.NPUPlace(0) place = paddle.NPUPlace(0)
)DOC"); )DOC");
g_npuplace_pytype = reinterpret_cast<PyTypeObject *>(npuplace.ptr()); g_npuplace_pytype = reinterpret_cast<PyTypeObject *>(npuplace.ptr());
npuplace npuplace.def("__init__", [](platform::NPUPlace &self, int dev_id) {})
.def("__init__",
[](platform::NPUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_ASCEND_CL
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid NPUPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) {
if (platform::GetNPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use NPU because there is no NPU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid NPUPlace(%d), must inside [0, %d), because NPU "
"number on your machine is %d",
dev_id,
platform::GetNPUDeviceCount(),
platform::GetNPUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::NPUPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use NPU because you have installed CPU/GPU version "
"PaddlePaddle.\n"
"If you want to use NPU, please try to install NPU version "
"PaddlePaddle by: pip install paddlepaddle-npu\n"
"If you only have CPU, please change NPUPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::NPUPlace>) .def("_type", &PlaceIndex<platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::Place>) .def("_equals", &IsSamePlace<platform::NPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::CUDAPlace>) .def("_equals", &IsSamePlace<platform::NPUPlace, platform::CUDAPlace>)
......
This diff has been collapsed.
...@@ -139,10 +139,6 @@ limitations under the License. */ ...@@ -139,10 +139,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
......
This diff has been collapsed.
...@@ -19,9 +19,7 @@ limitations under the License. */ ...@@ -19,9 +19,7 @@ limitations under the License. */
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/errors.h" #include "paddle/phi/core/errors.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/phi/backends/npu/npu_info.h"
#endif
#include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_info.h"
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
#include "paddle/phi/backends/mlu/mlu_info.h" #include "paddle/phi/backends/mlu/mlu_info.h"
...@@ -44,8 +42,6 @@ inline size_t Alignment(size_t size, ...@@ -44,8 +42,6 @@ inline size_t Alignment(size_t size,
alignment = phi::backends::gpu::GpuMinChunkSize(); alignment = phi::backends::gpu::GpuMinChunkSize();
#elif defined(PADDLE_WITH_XPU) #elif defined(PADDLE_WITH_XPU)
alignment = alignment; alignment = alignment;
#elif defined(PADDLE_WITH_ASCEND_CL)
alignment = phi::backends::npu::NPUMinChunkSize();
#elif defined(PADDLE_WITH_MLU) #elif defined(PADDLE_WITH_MLU)
alignment = phi::backends::mlu::MLUMinChunkSize(); alignment = phi::backends::mlu::MLUMinChunkSize();
#else #else
......
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.