PaddlePaddle / Paddle — commit 0b60f28c (unverified)

Authored Apr 03, 2023 by engineer1109; committed via GitHub on Apr 03, 2023.

remove WITH_ASCEND_CL PADDLE_WITH_ASCEND_CL WITH_ASCEND_CXX11 (#52448)
Parent: 04f8c24e

Showing 103 changed files with 140 additions and 4,400 deletions (+140 -4400).
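For context on the pattern this commit deletes: each WITH_* switch is a CMake option that, when enabled, injects a PADDLE_WITH_* preprocessor definition, which in turn gates the C++ code paths removed in the source-file hunks below. A minimal sketch of that plumbing, using the flag names from this diff (the project name is illustrative, not from the diff):

cmake_minimum_required(VERSION 3.15)
project(flag_plumbing_demo CXX) # illustrative name

# A user-facing switch, same shape as the removed option(WITH_ASCEND_CL ...).
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" OFF)

# When enabled, surface it to C++ as a macro, same shape as the removed
# add_definitions(-DPADDLE_WITH_ASCEND_CL) in cmake/configure.cmake below.
if(WITH_ASCEND_CL)
  add_definitions(-DPADDLE_WITH_ASCEND_CL)
endif()

Sources then guard device code with #ifdef PADDLE_WITH_ASCEND_CL ... #endif, which is why removing one flag touches both .cmake files and .cc/.h files throughout this diff.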
CMakeLists.txt  +0 -28
cmake/configure.cmake  +0 -4
cmake/external/ascend.cmake  +0 -108
cmake/external/gloo.cmake  +18 -38
cmake/external/protobuf.cmake  +1 -6
cmake/external/threadpool.cmake  +1 -5
cmake/external/warpctc.cmake  +51 -88
cmake/flags.cmake  +0 -4
cmake/inference_lib.cmake  +3 -13
cmake/operators.cmake  +0 -30
cmake/third_party.cmake  +0 -10
paddle/fluid/framework/details/CMakeLists.txt  +4 -11
paddle/fluid/framework/details/nan_inf_utils.h  +0 -6
paddle/fluid/framework/details/nan_inf_utils_detail.cc  +0 -176
paddle/fluid/framework/device_worker.h  +1 -2
paddle/fluid/framework/device_worker_factory.cc  +1 -2
paddle/fluid/framework/executor.cc  +0 -17
paddle/fluid/framework/fleet/CMakeLists.txt  +0 -7
paddle/fluid/framework/fleet/ascend_wrapper.cc  +0 -22
paddle/fluid/framework/fleet/ascend_wrapper.h  +0 -214
paddle/fluid/framework/garbage_collector.cc  +0 -26
paddle/fluid/framework/garbage_collector.h  +0 -22
paddle/fluid/framework/new_executor/interpreter/execution_config.cc  +0 -5
paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc  +0 -10
paddle/fluid/framework/new_executor/interpretercore.cc  +0 -27
paddle/fluid/framework/operator.cc  +0 -21
paddle/fluid/framework/parallel_executor.cc  +0 -14
paddle/fluid/framework/phi_utils.cc  +0 -9
paddle/fluid/framework/pipeline_trainer.cc  +1 -4
paddle/fluid/framework/section_worker.cc  +1 -14
paddle/fluid/framework/tensor_test.cc  +0 -66
paddle/fluid/framework/tensor_util.cc  +2 -156
paddle/fluid/framework/tensor_util.h  +0 -116
paddle/fluid/framework/tensor_util_test.cc  +0 -26
paddle/fluid/framework/trainer.h  +1 -2
paddle/fluid/framework/trainer_factory.cc  +1 -2
paddle/fluid/framework/type_defs.h  +0 -22
paddle/fluid/framework/var_type_traits.h  +0 -12
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc  +0 -48
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h  +0 -4
paddle/fluid/inference/api/analysis_config.cc  +0 -29
paddle/fluid/inference/api/analysis_predictor.cc  +0 -8
paddle/fluid/inference/api/api_impl.cc  +0 -17
paddle/fluid/inference/api/api_impl_tester.cc  +0 -9
paddle/fluid/inference/api/details/zero_copy_tensor.cc  +0 -38
paddle/fluid/inference/api/details/zero_copy_tensor_test.cc  +0 -4
paddle/fluid/inference/api/paddle_analysis_config.h  +0 -6
paddle/fluid/inference/capi_exp/pd_config.cc  +0 -5
paddle/fluid/inference/capi_exp/pd_config.h  +0 -8
paddle/fluid/inference/goapi/config.go  +0 -9
paddle/fluid/memory/allocation/CMakeLists.txt  +0 -5
paddle/fluid/memory/allocation/allocator_facade.cc  +1 -34
paddle/fluid/memory/allocation/allocator_facade.h  +0 -7
paddle/fluid/memory/allocation/buddy_allocator.cc  +1 -8
paddle/fluid/memory/allocation/buddy_allocator_test.cc  +1 -30
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc  +0 -204
paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc  +0 -16
paddle/fluid/memory/allocation/npu_allocator.cc  +0 -80
paddle/fluid/memory/allocation/npu_allocator.h  +0 -42
paddle/fluid/memory/allocation/npu_pinned_allocator.cc  +0 -99
paddle/fluid/memory/allocation/npu_pinned_allocator.h  +0 -51
paddle/fluid/memory/allocation/system_allocator.cc  +0 -129
paddle/fluid/memory/allocation/system_allocator.h  +0 -26
paddle/fluid/memory/allocation/system_allocator_test.cc  +0 -8
paddle/fluid/memory/memcpy.cc  +1 -423
paddle/fluid/operators/coalesce_tensor_op.cc  +1 -2
paddle/fluid/operators/copy_cross_scope_test.cc  +0 -12
paddle/fluid/operators/detection/CMakeLists.txt  +3 -14
paddle/fluid/operators/expand_op.h  +0 -7
paddle/fluid/operators/expand_v2_op.h  +0 -14
paddle/fluid/operators/math/CMakeLists.txt  +1 -10
paddle/fluid/operators/memcpy_d2h_op.cc  +0 -28
paddle/fluid/operators/norm_op.cc  +0 -4
paddle/fluid/platform/device/device_wrapper.h  +0 -3
paddle/fluid/platform/device_context.cc  +0 -25
paddle/fluid/platform/device_context.h  +0 -98
paddle/fluid/platform/device_event.h  +0 -6
paddle/fluid/platform/device_event_npu.cc  +0 -116
paddle/fluid/platform/dynload/dynamic_loader.cc  +0 -1
paddle/fluid/platform/dynload/dynamic_loader.h  +0 -1
paddle/fluid/platform/gen_comm_id_helper.cc  +2 -3
paddle/fluid/platform/gen_comm_id_helper.h  +2 -3
paddle/fluid/platform/init.cc  +0 -11
paddle/fluid/pybind/ascend_wrapper_py.cc  +0 -917
paddle/fluid/pybind/ascend_wrapper_py.h  +0 -32
paddle/fluid/pybind/imperative.cc  +0 -13
paddle/fluid/pybind/inference_api.cc  +1 -8
paddle/fluid/pybind/parallel_executor.cc  +0 -4
paddle/fluid/pybind/place.cc  +1 -48
paddle/fluid/pybind/pybind.cc  +1 -48
paddle/fluid/pybind/tensor.cc  +0 -4
paddle/fluid/pybind/tensor_py.h  +0 -62
paddle/phi/backends/device_memory_aligment.h  +1 -5
paddle/phi/backends/dynload/CMakeLists.txt  +0 -5
paddle/phi/backends/dynload/dynamic_loader.cc  +0 -18
paddle/phi/backends/dynload/dynamic_loader.h  +0 -1
paddle/phi/backends/npu/npu_info.h  +0 -36
paddle/phi/core/flags.cc  +4 -38
paddle/phi/core/utils/visit_place.h  +0 -20
paddle/phi/kernels/funcs/interpolate_function.h  +0 -7
test/CMakeLists.txt  +16 -40
test/amp/CMakeLists.txt  +16 -40
test/asp/CMakeLists.txt  +1 -4
CMakeLists.txt

@@ -58,10 +58,6 @@ option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
 option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
 option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
 option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF)
-# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
-# to develop some acl related functionality on x86
-option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
-option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
 option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF)
 option(WITH_CUSPARSELT "Compile PaddlePaddle with CUSPARSELT" OFF)
 option(WITH_SETUP_INSTALL "Compile PaddlePaddle with setup.py" OFF)

@@ -113,14 +109,6 @@ if(APPLE AND WITH_ARM)
     set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin")
   endif()
 endif()
-
-if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
-  if(WITH_ARM_BRPC)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
-  else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
-  endif()
-endif()
 if(WIN32)
   option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)

@@ -525,15 +513,6 @@ if(WITH_DISTRIBUTE)
       ON
       CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
 endif()
-if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC)
-  # disable WITH_PSCORE for NPU before include third_party
-  message(
-    WARNING "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.")
-  set(WITH_PSCORE
-      OFF
-      CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE)
-endif()
 if(WITH_ROCM AND HIP_VERSION LESS_EQUAL 40020496)
   # TODO(qili93): third-party rocksdb throw Illegal instruction with HIP version 40020496
   message(

@@ -567,13 +546,6 @@ if(WITH_RPC)
       OFF
       CACHE BOOL "Disable WITH_RPC when not compiled with distribute" FORCE)
 endif()
-if(WITH_ASCEND_CL AND WITH_RPC)
-  message(
-    WARNING "Disable WITH_RPC when compiling with NPU. Force WITH_RPC=OFF.")
-  set(WITH_RPC
-      OFF
-      CACHE BOOL "Disable WITH_RPC when compiling with NPU" FORCE)
-endif()
 if(WITH_ROCM AND WITH_RPC)
   message(
     WARNING "Disable WITH_RPC when compiling with ROCM. Force WITH_RPC=OFF.")
cmake/configure.cmake

@@ -97,10 +97,6 @@ if(WITH_ASCEND)
   add_definitions(-DPADDLE_WITH_ASCEND)
 endif()
-
-if(WITH_ASCEND_CL)
-  add_definitions(-DPADDLE_WITH_ASCEND_CL)
-endif()
 if(WITH_ASCEND_INT64)
   add_definitions(-DPADDLE_WITH_ASCEND_INT64)
 endif()
cmake/external/ascend.cmake

@@ -25,111 +25,3 @@ if(EXISTS
   # It means CANN 20.2 +
   add_definitions(-DPADDLE_WITH_ASCEND_STRING)
 endif()
-
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
-  set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
-  set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
-  set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
-  set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
-  set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
-  set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
-
-  set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR}
-                             ${ASCEND_ATC_DIR})
-  set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
-  set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
-  set(ATLAS_RUNTIME_INC_DIR
-      ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
-  set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
-  set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
-  set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR}
-                            ${ATLAS_ATC_DIR})
-
-  set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
-  set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
-  set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
-  include_directories(${ATLAS_RUNTIME_INC_DIR})
-
-  add_library(ascend_ge SHARED IMPORTED GLOBAL)
-  set_property(TARGET ascend_ge PROPERTY IMPORTED_LOCATION
-                                         ${atlas_ge_runner_lib})
-
-  add_library(ascend_graph SHARED IMPORTED GLOBAL)
-  set_property(TARGET ascend_graph PROPERTY IMPORTED_LOCATION
-                                            ${atlas_graph_lib})
-
-  add_library(atlas_acl SHARED IMPORTED GLOBAL)
-  set_property(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
-
-  add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
-endif()
-
-if(WITH_ASCEND_CL)
-  set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
-
-  set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
-  set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
-  set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
-  set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
-  set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
-
-  message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
-  message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
-  include_directories(${FWKACLLIB_INC_DIR})
-  include_directories(${ACLLIB_INC_DIR})
-
-  add_library(ascendcl SHARED IMPORTED GLOBAL)
-  set_property(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
-
-  add_library(ascend_hccl SHARED IMPORTED GLOBAL)
-  set_property(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION
-                                           ${ascend_hccl_lib})
-
-  add_library(acl_op_compiler SHARED IMPORTED GLOBAL)
-  set_property(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION
-                                               ${acl_op_compiler_lib})
-  add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
-endif()
-
-if(WITH_ASCEND_CL)
-  macro(find_ascend_toolkit_version ascend_toolkit_version_info)
-    file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS)
-    string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)"
-                 ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
-    string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)" "\\1"
-                         ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
-    string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION
-                         ${ASCEND_TOOLKIT_VERSION})
-    string(SUBSTRING "${CANN_VERSION}000" 0 6 CANN_VERSION)
-    add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}")
-    if(NOT ASCEND_TOOLKIT_VERSION)
-      set(ASCEND_TOOLKIT_VERSION "???")
-    else()
-      message(STATUS
-              "Current Ascend Toolkit version is ${ASCEND_TOOLKIT_VERSION}")
-    endif()
-  endmacro()
-
-  macro(find_ascend_driver_version ascend_driver_version_info)
-    file(READ ${ascend_driver_version_info} ASCEND_DRIVER_VERSION_CONTENTS)
-    string(REGEX MATCH "Version=([0-9]+\.[0-9]+\.[0-9]+)" ASCEND_DRIVER_VERSION
-                 "${ASCEND_DRIVER_VERSION_CONTENTS}")
-    string(REGEX REPLACE "Version=([0-9]+\.[0-9]+\.[0-9]+)" "\\1"
-                         ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION}")
-    if(NOT ASCEND_DRIVER_VERSION)
-      set(ASCEND_DRIVER_VERSION "???")
-    else()
-      message(STATUS
-              "Current Ascend Driver version is ${ASCEND_DRIVER_VERSION}")
-    endif()
-  endmacro()
-
-  if(WITH_ARM)
-    set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/arm64-linux)
-  else()
-    set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/x86_64-linux)
-  endif()
-
-  find_ascend_toolkit_version(
-    ${ASCEND_TOOLKIT_DIR}/ascend_toolkit_install.info)
-  find_ascend_driver_version(${ASCEND_DIR}/driver/version.info)
-endif()
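For reference, the block deleted above wraps prebuilt vendor .so files as CMake IMPORTED targets so other targets can link them by name. A standalone sketch of that pattern, with a placeholder path (not a real Ascend install location):

# Wrap a prebuilt shared library as a linkable CMake target.
set(vendor_lib /opt/vendor/lib64/libvendor.so) # placeholder path
add_library(vendor_runtime SHARED IMPORTED GLOBAL)
set_property(TARGET vendor_runtime PROPERTY IMPORTED_LOCATION ${vendor_lib})
# Umbrella target, mirroring the removed add_custom_target(extern_ascend ...).
add_custom_target(extern_vendor DEPENDS vendor_runtime)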
cmake/external/gloo.cmake

@@ -61,44 +61,24 @@ if(CMAKE_COMPILER_IS_GNUCC)
   endif()
 endif()
 include_directories(${GLOO_INCLUDE_DIR})
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  ExternalProject_Add(
-    ${GLOO_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${GLOO_REPOSITORY}
-    GIT_TAG ${GLOO_TAG}
-    PREFIX "${GLOO_PREFIX_DIR}"
-    UPDATE_COMMAND ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND
-      mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake
-      .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && ${CMAKE_COMMAND} --build . &&
-      mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
-    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
-                    ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
-    COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
-            "${GLOO_INCLUDE_DIR}/gloo"
-    BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
-else()
-  ExternalProject_Add(
-    ${GLOO_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${GLOO_REPOSITORY}
-    GIT_TAG ${GLOO_TAG}
-    PREFIX "${GLOO_PREFIX_DIR}"
-    UPDATE_COMMAND ""
-    PATCH_COMMAND ${GLOO_PATCH_COMMAND}
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND
-      mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake
-      .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && ${CMAKE_COMMAND} --build . &&
-      mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
-    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
-                    ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
-    COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
-            "${GLOO_INCLUDE_DIR}/gloo"
-    BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
-endif()
+ExternalProject_Add(
+  ${GLOO_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${GLOO_REPOSITORY}
+  GIT_TAG ${GLOO_TAG}
+  PREFIX "${GLOO_PREFIX_DIR}"
+  UPDATE_COMMAND ""
+  PATCH_COMMAND ${GLOO_PATCH_COMMAND}
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND
+    mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake ..
+    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && ${CMAKE_COMMAND} --build . && mkdir
+    -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/glo
+  INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
+                  ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
+  COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
+          "${GLOO_INCLUDE_DIR}/gloo"
+  BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
 add_library(gloo STATIC IMPORTED GLOBAL)
 set_property(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIBRARIES})
cmake/external/protobuf.cmake

@@ -237,9 +237,6 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST)
   if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
     set(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
    set(PROTOBUF_TAG v21.12)
-  elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
-    set(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
-    set(PROTOBUF_TAG v21.12)
   elseif(WITH_IPU)
     set(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
     set(PROTOBUF_TAG v21.12)

@@ -325,9 +322,7 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST)
   endif()
 endfunction()

-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  set(PROTOBUF_VERSION 21.12)
-elseif(WITH_IPU)
+if(WITH_IPU)
   set(PROTOBUF_VERSION 21.12)
 elseif(WITH_ARM_BRPC)
   set(PROTOBUF_VERSION 21.12-baidu-ee-common)
cmake/external/threadpool.cmake

@@ -15,11 +15,7 @@
 include(ExternalProject)

 set(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool)
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  set(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git)
-else()
-  set(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
-endif()
+set(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
 set(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040)

 set(THREADPOOL_INCLUDE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool)
cmake/external/warpctc.cmake

@@ -64,96 +64,59 @@ else()
   set(USE_OMP ON)
 endif()

-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  ExternalProject_Add(
-    extern_warpctc
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${WARPCTC_REPOSITORY}
-    GIT_TAG ${WARPCTC_TAG}
-    PREFIX ${WARPCTC_PREFIX_DIR}
-    #UPDATE_COMMAND ""
-    PATCH_COMMAND ""
-    BUILD_ALWAYS 1
-    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-               -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-               -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-               -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-               -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-               -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-               -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-               -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-               -DWITH_GPU=${WITH_GPU}
-               -DWITH_ROCM=${WITH_ROCM}
-               -DWITH_OMP=${USE_OMP}
-               -DWITH_TORCH=OFF
-               -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-               -DBUILD_SHARED=ON
-               -DBUILD_TESTS=OFF
-               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-               -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-               ${EXTERNAL_OPTIONAL_ARGS}
-    CMAKE_CACHE_ARGS
-      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-      -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
-    BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES})
-else()
-  if(WIN32)
-    set(WARPCTC_C_FLAGS $<FILTER:${CMAKE_C_FLAGS},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_C_FLAGS_DEBUG
-        $<FILTER:${CMAKE_C_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_C_FLAGS_RELEASE
-        $<FILTER:${CMAKE_C_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_CXX_FLAGS $<FILTER:${CMAKE_CXX_FLAGS},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_CXX_FLAGS_RELEASE
-        $<FILTER:${CMAKE_CXX_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_CXX_FLAGS_DEBUG
-        $<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
-  else()
-    set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS})
-    set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
-    set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
-    set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS})
-    set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
-    set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
-  endif()
-  ExternalProject_Add(
-    extern_warpctc
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${WARPCTC_REPOSITORY}
-    GIT_TAG ${WARPCTC_TAG}
-    PREFIX ${WARPCTC_PREFIX_DIR}
-    UPDATE_COMMAND ""
-    PATCH_COMMAND ${WARPCTC_PATCH_COMMAND}
-    #BUILD_ALWAYS 1
-    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-               -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS}
-               -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG}
-               -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE}
-               -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS}
-               -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE}
-               -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG}
-               -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-               -DWITH_GPU=${WITH_GPU}
-               -DWITH_ROCM=${WITH_ROCM}
-               -DWITH_OMP=${USE_OMP}
-               -DWITH_TORCH=OFF
-               -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-               -DBUILD_SHARED=ON
-               -DBUILD_TESTS=OFF
-               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-               -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-               -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
-               ${EXTERNAL_OPTIONAL_ARGS}
-               ${WARPCTC_CCBIN_OPTION}
-    CMAKE_CACHE_ARGS
-      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-      -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
-    BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES})
+if(WIN32)
+  set(WARPCTC_C_FLAGS $<FILTER:${CMAKE_C_FLAGS},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_C_FLAGS_DEBUG
+      $<FILTER:${CMAKE_C_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_C_FLAGS_RELEASE
+      $<FILTER:${CMAKE_C_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_CXX_FLAGS $<FILTER:${CMAKE_CXX_FLAGS},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_CXX_FLAGS_RELEASE
+      $<FILTER:${CMAKE_CXX_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_CXX_FLAGS_DEBUG
+      $<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
+else()
+  set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS})
+  set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
+  set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
+  set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+  set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
+  set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
 endif()
+ExternalProject_Add(
+  extern_warpctc
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${WARPCTC_REPOSITORY}
+  GIT_TAG ${WARPCTC_TAG}
+  PREFIX ${WARPCTC_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  PATCH_COMMAND ${WARPCTC_PATCH_COMMAND}
+  #BUILD_ALWAYS 1
+  CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+             -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+             -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS}
+             -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG}
+             -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS}
+             -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG}
+             -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
+             -DWITH_GPU=${WITH_GPU}
+             -DWITH_ROCM=${WITH_ROCM}
+             -DWITH_OMP=${USE_OMP}
+             -DWITH_TORCH=OFF
+             -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
+             -DBUILD_SHARED=ON
+             -DBUILD_TESTS=OFF
+             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+             -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
+             ${EXTERNAL_OPTIONAL_ARGS}
+             ${WARPCTC_CCBIN_OPTION}
+  CMAKE_CACHE_ARGS
+    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+    -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+    -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
+  BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES})

 message(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
 get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY)
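The surviving WIN32 branch relies on $<FILTER:list,EXCLUDE,regex>, a generator expression (CMake 3.15+) that drops matching elements from a semicolon-separated list at generate time; here it strips MSVC's /Zc:inline, which the warp-ctc sub-build cannot accept. A minimal self-contained sketch:

set(flags "/W3;/Zc:inline;/O2")
# EXCLUDE removes every list element matching the regex, so the compiler
# ends up receiving only /W3 and /O2.
add_compile_options("$<FILTER:${flags},EXCLUDE,/Zc:inline>")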
cmake/flags.cmake

@@ -167,10 +167,6 @@ if(NOT WIN32)
     set(COMMON_FLAGS ${COMMON_FLAGS} -Wno-sign-compare -Wno-non-virtual-dtor)
   endif()

-  if(WITH_ASCEND_CL AND WITH_ARM_BRPC)
-    set(COMMON_FLAGS ${COMMON_FLAGS} -faligned-new)
-  endif()
-
   if(NOT APPLE)
     if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM))
       set(COMMON_FLAGS
cmake/inference_lib.cmake

@@ -508,14 +508,9 @@ function(version version_file)
     OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
   file(
     WRITE ${version_file}
     "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
     "WITH_MKL: ${WITH_MKL}\n"
     "WITH_MKLDNN: ${WITH_MKLDNN}\n"
     "WITH_GPU: ${WITH_GPU}\n"
     "WITH_ROCM: ${WITH_ROCM}\n"
-    "WITH_ASCEND_CL: ${WITH_ASCEND_CL}\n"
-    "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n"
     "WITH_IPU: ${WITH_IPU}\n")
   if(WITH_GPU)
     file(
       APPEND ${version_file}
       "CUDA version: ${CUDA_VERSION}\n"

@@ -526,11 +521,6 @@ function(version version_file)
       "HIP version: v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}\n"
       "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n")
   endif()
-  if(WITH_ASCEND_CL)
-    file(
-      APPEND ${version_file}
-      "Ascend Toolkit version: ${ASCEND_TOOLKIT_VERSION}\n"
-      "Ascend Driver version: ${ASCEND_DRIVER_VERSION}\n")
-  endif()
   if(WITH_IPU)
     file(APPEND ${version_file} "PopART version: ${POPART_VERSION}\n")
   endif()
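The version() function stamps the build configuration into a text file with one file(WRITE) followed by conditional file(APPEND) calls; this commit simply drops the two Ascend lines from the stamp. A reduced sketch of the same mechanism (the output path is illustrative):

set(version_file "${CMAKE_BINARY_DIR}/version.txt") # illustrative path
# First write truncates/creates the file; later appends extend it.
file(WRITE ${version_file}
     "WITH_GPU: ${WITH_GPU}\n"
     "WITH_ROCM: ${WITH_ROCM}\n")
if(WITH_GPU)
  file(APPEND ${version_file} "CUDA version: ${CUDA_VERSION}\n")
endif()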
cmake/operators.cmake

@@ -74,9 +74,6 @@ function(op_library TARGET)
   set(MKLDNN_FILE)
   set(op_common_deps operator op_registry math_function layer
                      common_infer_shape_functions)
-  if(WITH_ASCEND_CL)
-    set(op_common_deps ${op_common_deps} npu_op_runner)
-  endif()
   if(WITH_MLU)
     set(op_common_deps ${op_common_deps} mlu_baseop)
   endif()

@@ -175,12 +172,6 @@ function(op_library TARGET)
       list(APPEND xpu_kp_cc_srcs ${TARGET}.kps)
     endif()
   endif()
-  if(WITH_ASCEND_CL)
-    string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
-    if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
-      list(APPEND npu_cc_srcs ${NPU_FILE}.cc)
-    endif()
-  endif()
   if(WITH_MLU)
     string(REPLACE "_op" "_op_mlu" MLU_FILE "${TARGET}")
     if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MLU_FILE}.cc)

@@ -213,8 +204,6 @@ function(op_library TARGET)
       list(APPEND xpu_kp_cc_srcs ${src})
     elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$")
       list(APPEND xpu_kp_cc_srcs ${src})
-    elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
-      list(APPEND npu_cc_srcs ${src})
     elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
       list(APPEND mlu_cc_srcs ${src})
     elseif(${src} MATCHES ".*\\.cc$")

@@ -331,13 +320,6 @@ function(op_library TARGET)
       SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs}
       DEPS ${op_library_DEPS} ${op_common_deps})
   else()
-    # deal with CANN version control while registering NPU operators before build
-    if(WITH_ASCEND_CL)
-      if(CANN_VERSION LESS 504000)
-        list(REMOVE_ITEM npu_cc_srcs "multinomial_op_npu.cc")
-        list(REMOVE_ITEM npu_cc_srcs "take_along_axis_op_npu.cc")
-      endif()
-    endif()
     # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
     if(WITH_UNITY_BUILD AND op_library_UNITY)
       # Combine the cc source files.

@@ -541,18 +523,6 @@ function(op_library TARGET)
     endforeach()
   endif()
-  # pybind USE_OP_DEVICE_KERNEL for NPU
-  if(WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
-    foreach(npu_src ${npu_cc_srcs})
-      set(op_name "")
-      find_register(${npu_src} "REGISTER_OP_NPU_KERNEL" op_name)
-      if(NOT ${op_name} EQUAL "")
-        file(APPEND ${pybind_file}
-             "USE_OP_DEVICE_KERNEL(${op_name}, NPU);\n")
-        set(pybind_flag 1)
-      endif()
-    endforeach()
-  endif()
   # pybind USE_OP_DEVICE_KERNEL for MLU
   if(WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0)
     foreach(mlu_src ${mlu_cc_srcs})
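op_library() discovers optional per-device kernels purely by naming convention: derive <op>_op_npu.cc from the target name and compile it only if the file exists. The commit removes the NPU instance of this pattern while the MLU one stays. A minimal sketch with an illustrative target name:

set(TARGET "softmax_op") # illustrative op name, not from this diff
# Derive the device-specific source name, then pick it up only when present.
string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}") # -> softmax_op_npu
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
  list(APPEND npu_cc_srcs ${NPU_FILE}.cc)
endif()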
cmake/third_party.cmake

@@ -394,16 +394,6 @@ if(WITH_BOX_PS)
   list(APPEND third_party_deps extern_box_ps)
 endif()

-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  include(external/ascend)
-  if(WITH_ASCEND OR WITH_ASCEND_CL)
-    list(APPEND third_party_deps extern_ascend)
-  endif()
-  if(WITH_ASCEND_CL)
-    list(APPEND third_party_deps extern_ascend_cl)
-  endif()
-endif()
-
 if(WITH_PSCORE)
   include(external/snappy)
   list(APPEND third_party_deps extern_snappy)
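third_party.cmake accumulates optional external targets into the third_party_deps list; deleting the Ascend branch just stops extern_ascend/extern_ascend_cl from ever entering it. A sketch of the accumulation shape (the umbrella target at the end is an assumption about how the list is consumed, not shown in this diff):

set(third_party_deps)
if(WITH_PSCORE)
  list(APPEND third_party_deps extern_snappy) # target defined by external/snappy
endif()
# Assumed consumer: one aggregate target depending on whatever was collected.
add_custom_target(third_party ALL DEPENDS ${third_party_deps})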
paddle/fluid/framework/details/CMakeLists.txt

@@ -205,17 +205,10 @@ elseif(WITH_ROCM)
     SRCS fused_broadcast_op_handle.cc
     DEPS broadcast_op_handle)
 else()
-  if(WITH_ASCEND_CL)
-    cc_library(
-      nan_inf_utils
-      SRCS nan_inf_utils_detail.cc
-      DEPS npu_op_runner framework_proto scope place)
-  else()
-    cc_library(
-      nan_inf_utils
-      SRCS nan_inf_utils_detail.cc
-      DEPS framework_proto scope place)
-  endif()
+  cc_library(
+    nan_inf_utils
+    SRCS nan_inf_utils_detail.cc
+    DEPS framework_proto scope place)
   cc_library(
     all_reduce_op_handle
     SRCS all_reduce_op_handle.cc
paddle/fluid/framework/details/nan_inf_utils.h

@@ -54,12 +54,6 @@ void CheckOpHasNanOrInfInDygraph(const std::string& op_type,
   }
 }

-#ifdef PADDLE_WITH_ASCEND_CL
-void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
-                                 const framework::Scope& scope,
-                                 const platform::Place& place);
-#endif
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
paddle/fluid/framework/details/nan_inf_utils_detail.cc

@@ -19,8 +19,6 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/phi/common/amp_type_traits.h"
-#ifdef PADDLE_WITH_ASCEND_CL
-#endif
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/phi/kernels/funcs/eigen/extensions.h"

@@ -243,40 +241,6 @@ void CheckVarHasNanOrInf(const std::string& op_type,
         "phi::DenseTensor[%s] use xpu place. PaddlePaddle must compile "
         "with XPU.",
         var_name));
-#endif
-    return;
-  } else if (platform::is_npu_place(tensor->place())) {
-#ifdef PADDLE_WITH_ASCEND_CL
-    if (framework::TransToProtoVarType(tensor->dtype()) !=
-        proto::VarType::FP32) {
-      return;
-    }
-
-    phi::DenseTensor cpu_tensor;
-    cpu_tensor.Resize(tensor->dims());
-    float* cpu_data = static_cast<float*>(
-        cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype()));
-
-    framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
-    bool flag = false;
-    for (int i = 0; i < cpu_tensor.numel(); i++) {
-      if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
-        flag = true;
-        break;
-      }
-    }
-    PADDLE_ENFORCE_NE(
-        flag,
-        true,
-        platform::errors::Fatal(
-            "Operator %s output phi::DenseTensor %s contains Inf.",
-            op_type,
-            var_name));
-#else
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "phi::DenseTensor[%s] use npu place. PaddlePaddle must compile "
-        "with NPU.",
-        var_name));
-#endif
 #endif
     return;
   }

@@ -309,139 +273,6 @@ bool IsSkipOp(const framework::OperatorBase& op) {
   return false;
 }

-#ifdef PADDLE_WITH_ASCEND_CL
-using NpuOpRunner = paddle::operators::NpuOpRunner;
-
-constexpr int FLOAT_STATUS_SIZE = 8;
-
-static phi::DenseTensor& npu_float_status() {
-  static phi::DenseTensor float_status;
-  return float_status;
-}
-
-void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
-                                 const framework::Scope& scope,
-                                 const platform::Place& place) {
-  if (!platform::is_npu_place(place)) return;
-
-  std::call_once(white_list_init_flag, InitWhiteListFormEnv);
-  if (IsSkipOp(op)) return;
-
-  auto* dev_ctx = reinterpret_cast<platform::NPUDeviceContext*>(
-      platform::DeviceContextPool::Instance().Get(place));
-  auto stream = dev_ctx->stream();
-
-  auto& flag = npu_float_status();
-  flag.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
-  NpuOpRunner("NPUAllocFloatStatus", {}, {flag}).Run(stream);
-
-  phi::DenseTensor tmp;
-  tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
-  NpuOpRunner("NPUClearFloatStatus", {tmp}, {flag}).Run(stream);
-}
-
-void PrintNpuVarInfo(const std::string& op_type,
-                     const std::string& var_name,
-                     const framework::Variable* var,
-                     const platform::Place& place) {
-  const phi::DenseTensor* tensor{nullptr};
-  if (var->IsType<phi::DenseTensor>()) {
-    tensor = &var->Get<phi::DenseTensor>();
-  } else if (var->IsType<phi::SelectedRows>()) {
-    tensor = &var->Get<phi::SelectedRows>().value();
-  } else {
-    VLOG(10) << var_name << " var_name need not to check";
-    return;
-  }
-
-  if ((framework::TransToProtoVarType(tensor->dtype()) !=
-       proto::VarType::FP32) &&
-      (framework::TransToProtoVarType(tensor->dtype()) !=
-       proto::VarType::FP16)) {
-    return;
-  }
-
-  if (tensor->memory_size() == 0) {
-    VLOG(10) << var_name << " var_name need not to check, size == 0";
-    return;
-  }
-
-  VLOG(10) << "begin check " << op_type << " var_name:" << var_name
-           << ", place:" << tensor->place() << ", numel:" << tensor->numel();
-
-  phi::DenseTensor cpu_tensor;
-  cpu_tensor.Resize(tensor->dims());
-  cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype());
-  framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
-
-  LOG(WARNING) << "print [" << var_name << "] tensor info:";
-  // use env strategy control in future, -1=print_all.
-  int print_num = 3;
-  if (framework::TransToProtoVarType(tensor->dtype()) ==
-      proto::VarType::FP32) {
-    const float* value = cpu_tensor.data<float>();
-    PrintNanInf(value, tensor->numel(), print_num, op_type, var_name, false);
-  } else if (framework::TransToProtoVarType(tensor->dtype()) ==
-             proto::VarType::FP16) {
-    const paddle::platform::float16* value =
-        cpu_tensor.data<paddle::platform::float16>();
-    PrintNanInf(value, tensor->numel(), print_num, op_type, var_name, false);
-  }
-}
-
-void PrintNPUOpValueInfo(const framework::OperatorBase& op,
-                         const framework::Scope& scope,
-                         const platform::Place& place) {
-  LOG(WARNING) << "There are `nan` or `inf` in operator (" << op.Type()
-               << "), here we print some tensor value info of this op.";
-  for (auto& vname : op.InputVars()) {
-    auto* var = scope.FindVar(vname);
-    if (var == nullptr) continue;
-    PrintNpuVarInfo(op.Type(), vname, var, place);
-  }
-
-  for (auto& vname : op.OutputVars(true)) {
-    auto* var = scope.FindVar(vname);
-    if (var == nullptr) continue;
-    PrintNpuVarInfo(op.Type(), vname, var, place);
-  }
-}
-
-static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
-                                  const framework::Scope& scope,
-                                  const platform::Place& place) {
-  if (!platform::is_npu_place(place)) return;
-
-  auto* dev_ctx = reinterpret_cast<platform::NPUDeviceContext*>(
-      platform::DeviceContextPool::Instance().Get(place));
-  auto stream = dev_ctx->stream();
-
-  auto& flag = npu_float_status();
-  phi::DenseTensor tmp;
-  tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
-  // NPUGetFloatStatus updates data on input in-place.
-  // tmp is only placeholder.
-  NpuOpRunner("NPUGetFloatStatus", {flag}, {tmp}).Run(stream);
-
-  phi::DenseTensor cpu_tensor;
-  auto cpu_place = platform::CPUPlace();
-  float* cpu_data = static_cast<float*>(
-      cpu_tensor.mutable_data<float>({FLOAT_STATUS_SIZE}, cpu_place));
-  framework::TensorCopySync(flag, cpu_place, &cpu_tensor);
-
-  float sum = 0.0;
-  for (int i = 0; i < FLOAT_STATUS_SIZE; ++i) {
-    sum += cpu_data[i];
-  }
-
-  if (sum >= 1.0) PrintNPUOpValueInfo(op, scope, place);
-
-  PADDLE_ENFORCE_LT(sum,
-                    1.0,
-                    platform::errors::PreconditionNotMet(
-                        "Operator %s contains Nan/Inf.", op.Type()));
-}
-#endif
-
 void CheckOpHasNanOrInf(const framework::OperatorBase& op,
                         const framework::Scope& exec_scope,
                         const platform::Place& place) {

@@ -449,13 +280,6 @@ void CheckOpHasNanOrInf(const framework::OperatorBase& op,
   if (IsSkipOp(op)) return;
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (platform::is_npu_place(place)) {
-    NPUCheckOpHasNanOrInf(op, exec_scope, place);
-    return;
-  }
-#endif
-
   if (op_var_nan_inf_white_list().count(op.Type()) == 0) {
     // NOTE. vname may destruct in the end of this func.
     for (auto& vname : op.OutputVars(true)) {
paddle/fluid/framework/device_worker.h

@@ -674,8 +674,7 @@ class PSGPUWorker : public HogwildWorker {
 };
 #endif

-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class SectionWorker : public DeviceWorker {
  public:
   SectionWorker() {}
paddle/fluid/framework/device_worker_factory.cc

@@ -83,8 +83,7 @@ REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
 REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
 #endif

-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
 }  // namespace framework
paddle/fluid/framework/executor.cc

@@ -516,23 +516,6 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
 #else
     PADDLE_THROW(
         platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
-#endif
-  } else if (platform::is_npu_place(place_)) {
-#ifdef PADDLE_WITH_ASCEND_CL
-    if (IsFastEagerDeletionModeEnabled()) {
-      VLOG(4) << "Use unsafe fast gc for NPU.";
-      gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size));
-    } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Please set FLAGS_fast_eager_deletion_mode=true to use "
-          "GarbageCollector on NPU."));
-      // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
-      VLOG(4) << "Use default stream gc for NPU.";
-      gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size));
-    }
-#else
-    PADDLE_THROW(
-        platform::errors::Unimplemented("No NPU gc found in CPU/NPU paddle"));
 #endif
   } else if (platform::is_mlu_place(place_)) {
 #ifdef PADDLE_WITH_MLU
paddle/fluid/framework/fleet/CMakeLists.txt

@@ -124,10 +124,3 @@ cc_test(
   test_fleet_cc
   SRCS test_fleet.cc
   DEPS fleet_wrapper gloo_wrapper fs shell)
-
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  cc_library(
-    ascend_wrapper
-    SRCS ascend_wrapper.cc
-    DEPS framework_proto lod_tensor ascend_ge ascend_graph)
-endif()
paddle/fluid/framework/fleet/ascend_wrapper.cc (deleted, 100644 → 0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
namespace paddle {
namespace framework {
std::shared_ptr<AscendInstance> AscendInstance::ascend_instance_ = nullptr;
}  // end namespace framework
}  // end namespace paddle
#endif
paddle/fluid/framework/fleet/ascend_wrapper.h (deleted, 100644 → 0)

/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#ifdef PADDLE_WITH_ASCEND_CL
#include <glog/logging.h>

#include <map>
#include <memory>
#include <string>
#include <vector>

#include "ge/ge_api.h"
#include "graph/attr_value.h"
#include "graph/tensor.h"
#include "graph/types.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/timer.h"

namespace paddle {
namespace framework {

typedef ge::Graph AscendGraphDesc;

#ifdef PADDLE_WITH_ASCEND_STRING
using AscendString = ge::AscendString;
#else
using AscendString = std::string;
#endif

class AscendInstance {
 public:
  virtual ~AscendInstance() {}
  AscendInstance() {}

  std::map<AscendString, AscendString> _GetDefaultInitOptions() {
    std::map<AscendString, AscendString> init_options;
    init_options["ge.exec.deviceId"] = "0";
    init_options["ge.graphRunMode"] = "1";
    return init_options;
  }

  std::map<AscendString, AscendString> _GetDefaultInitSessionOptions() {
    std::map<AscendString, AscendString> init_options;
    // init_options["a"] = "b";
    // init_options["ge.trainFlag"] = "1";
    return init_options;
  }

  ge::Status InitGEForUT() {
    return ge::GEInitialize(_GetDefaultInitOptions());
  }

  void InitGlobalResouces() {
    LOG(INFO) << "Begin ascend InitGlobalResouces";
    session_.reset(new ge::Session(_GetDefaultInitSessionOptions()));
    if (session_ == nullptr) {
      PADDLE_THROW(platform::errors::Fatal("new session error: nullptr"));
    }
    LOG(INFO) << "End ascend InitGlobalResouces";
  }

  void DestroyGlobalResouces() {
    LOG(INFO) << "Begin ascend DestroyGlobalResouces";
    session_ = nullptr;
    LOG(INFO) << "Begin ascend DestroyGlobalResouces";
  }

  static std::shared_ptr<AscendInstance> GetInstance() {
    if (nullptr == ascend_instance_) {
      ascend_instance_.reset(new paddle::framework::AscendInstance());
      VLOG(1) << "Initialize AscendInstance Done";
    }
    return ascend_instance_;
  }

  void AddAscendSubgraph(int graph_idx, const AscendGraphDesc& graph) {
    ge::Status status = session_->AddGraph(graph_idx, graph);
    PADDLE_ENFORCE_EQ(status,
                      ge::SUCCESS,
                      paddle::platform::errors::PreconditionNotMet(
                          "Calling addGraph of graph engine failed, please "
                          "check Ascend Log."));
    VLOG(1) << "AddAscendSubgraph " << graph_idx << " Done";
  }

  ge::DataType VarTypeToGeType(proto::VarType::Type type) {
    if (type == proto::VarType::FP16) {
      return ge::DataType::DT_FLOAT16;
    } else if (type == proto::VarType::FP32) {
      return ge::DataType::DT_FLOAT;
    } else if (type == proto::VarType::FP64) {
      return ge::DataType::DT_DOUBLE;
    } else if (type == proto::VarType::INT32) {
      return ge::DataType::DT_INT32;
    } else if (type == proto::VarType::INT64) {
      return ge::DataType::DT_INT64;
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Not support %s as tensor type.", DataTypeToString(type)));
    }
  }

  int GeTypeSize(proto::VarType::Type type) {
    if (type == proto::VarType::FP16) {
      return 2;
    } else if (type == proto::VarType::FP32) {
      return 4;
    } else if (type == proto::VarType::FP64) {
      return 8;
    } else if (type == proto::VarType::INT32) {
      return 4;
    } else if (type == proto::VarType::INT64) {
      return 8;
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Not support %s as tensor type.", DataTypeToString(type)));
    }
  }

  ge::Tensor ConvertToGeTensor(const phi::DenseTensor* tensor) {
    auto numel = tensor->numel();
    std::vector<int64_t> vec_dim;
    auto dimen = arity(tensor->dims());
    for (auto i = 0; i < dimen; ++i) {
      vec_dim.push_back(tensor->dims()[i]);
    }
    // For Debug
    // VLOG(1) << "input numel: " << numel << ", dimen is " << vec_dim.size() <<
    // ", and shape is";
    // for (const auto e : vec_dim) {
    //   VLOG(0) << e;
    // }

    ge::Shape shape(vec_dim);
    ge::TensorDesc tensor_desc(
        shape,
        ge::Format::FORMAT_ND,
        VarTypeToGeType(framework::TransToProtoVarType(tensor->dtype())));
    tensor_desc.SetRealDimCnt(vec_dim.size());

    const uint8_t* data = reinterpret_cast<const uint8_t*>(tensor->data());
    std::vector<uint8_t> dst(
        numel * GeTypeSize(framework::TransToProtoVarType(tensor->dtype())));
    memcpy(dst.data(),
           data,
           GeTypeSize(framework::TransToProtoVarType(tensor->dtype())) * numel);
    ge::Tensor ge_tensor(tensor_desc, dst);
    return ge_tensor;
  }

  void RunAscendSubgraph(int graph_idx,
                         const std::vector<const phi::DenseTensor*>& inputs,
                         std::vector<phi::DenseTensor*>* outputs) {
    VLOG(1) << "Ascend Graph[" << graph_idx << "] is about to run.";
    // Convert paddle phi::DenseTensor to GE phi::DenseTensor
    std::vector<ge::Tensor> ge_inputs;
    for (const auto& e : inputs) {
      ge_inputs.push_back(ConvertToGeTensor(e));
    }

    // Run Graph
    std::vector<ge::Tensor> ge_outputs;
    ge::Status status = session_->RunGraph(graph_idx, ge_inputs, ge_outputs);
    PADDLE_ENFORCE_EQ(status,
                      ge::SUCCESS,
                      paddle::platform::errors::PreconditionNotMet(
                          "Calling RunGraph of graph engine failed, please "
                          "check Ascend Log."));
    VLOG(1) << "Run Ascend Graph[" << graph_idx << "] Done";

    // change tensor back, note all tensor's type computed in GE is uint8
    for (size_t i = 0; i < ge_outputs.size(); ++i) {
      const uint8_t* ret_data = ge_outputs[i].GetData();
      size_t size = ge_outputs[i].GetSize();
      VLOG(1) << "GE phi::DenseTensor size of the " << i << "th output var is "
              << size;
      auto* dst = (*outputs)[i]->mutable_data<uint8_t>({(int64_t)size},
                                                       platform::CPUPlace());
      memcpy(dst, ret_data, size);

      // Following for debug:
      // VLOG(0) << "output for " << i << " var: ";
      // float *tmp = reinterpret_cast<float*>(dst);
      // for (size_t j = 0; j < size / 4; ++j) {
      //   printf("%f ", tmp[j]);
      // }
      // printf("\n");
    }
  }

 protected:
  std::shared_ptr<ge::Session> session_;

 private:
  static std::shared_ptr<AscendInstance> ascend_instance_;
};
}  // namespace framework
}  // namespace paddle
#endif
paddle/fluid/framework/garbage_collector.cc

@@ -125,32 +125,6 @@ void CUDAPinnedGarbageCollector::ClearCallback(
 }
 #endif

-#ifdef PADDLE_WITH_ASCEND_CL
-NPUDefaultStreamGarbageCollector::NPUDefaultStreamGarbageCollector(
-    const platform::NPUPlace& place, size_t max_memory_size)
-    : GarbageCollector(place, max_memory_size) {}
-
-void NPUDefaultStreamGarbageCollector::Wait() const {
-  static_cast<platform::NPUDeviceContext*>(this->dev_ctx_)
-      ->WaitStreamCallback();
-}
-
-void NPUDefaultStreamGarbageCollector::ClearCallback(
-    const std::function<void()>& callback) {
-  static_cast<platform::NPUDeviceContext*>(this->dev_ctx_)
-      ->AddStreamCallback(callback);
-}
-
-NPUUnsafeFastGarbageCollector::NPUUnsafeFastGarbageCollector(
-    const platform::NPUPlace& place, size_t max_memory_size)
-    : GarbageCollector(place, max_memory_size) {}
-
-void NPUUnsafeFastGarbageCollector::ClearCallback(
-    const std::function<void()>& callback) {
-  callback();
-}
-#endif
-
 #ifdef PADDLE_WITH_MLU
 MLUDefaultStreamGarbageCollector::MLUDefaultStreamGarbageCollector(
     const platform::MLUPlace& place, size_t max_memory_size)
paddle/fluid/framework/garbage_collector.h
@@ -139,28 +139,6 @@ class CUDAPinnedGarbageCollector : public GarbageCollector {
 };
 #endif
-
-#ifdef PADDLE_WITH_ASCEND_CL
-class NPUDefaultStreamGarbageCollector : public GarbageCollector {
- public:
-  NPUDefaultStreamGarbageCollector(const platform::NPUPlace& place,
-                                   size_t max_memory_size);
-
-  void Wait() const override;
-
- protected:
-  void ClearCallback(const std::function<void()>& callback) override;
-};
-
-class NPUUnsafeFastGarbageCollector : public GarbageCollector {
- public:
-  NPUUnsafeFastGarbageCollector(const platform::NPUPlace& place,
-                                size_t max_memory_size);
-
- protected:
-  void ClearCallback(const std::function<void()>& callback) override;
-};
-#endif
 
 #ifdef PADDLE_WITH_MLU
 class MLUDefaultStreamGarbageCollector : public GarbageCollector {
  public:
...
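The two removed collectors embody two deletion strategies: the default-stream variant defers each free behind a stream callback and exposes Wait(), while the "unsafe fast" variant frees immediately and relies on fast eager deletion mode for safety. A minimal sketch of that design split, with illustrative stand-in classes rather than Paddle types:

    #include <functional>
    #include <iostream>

    // Deferred strategy: the free runs only once the device stream reaches
    // the callback (real code: AddStreamCallback / WaitStreamCallback).
    struct DeferredCollector {
      void ClearCallback(const std::function<void()>& cb) { pending_ = cb; }
      void Wait() {
        if (pending_) pending_();  // stand-in for WaitStreamCallback()
      }
      std::function<void()> pending_;
    };

    // Fast strategy: free immediately, as NPUUnsafeFastGarbageCollector did.
    struct FastCollector {
      void ClearCallback(const std::function<void()>& cb) { cb(); }
    };

    int main() {
      DeferredCollector d;
      d.ClearCallback([] { std::cout << "freed after stream sync\n"; });
      d.Wait();
      FastCollector f;
      f.ClearCallback([] { std::cout << "freed immediately\n"; });
      return 0;
    }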
paddle/fluid/framework/new_executor/interpreter/execution_config.cc
@@ -60,11 +60,6 @@ inline std::tuple<int, int> GetThreadPoolConfig(const phi::Place& place,
   if (platform::is_xpu_place(place)) {
 #if defined(PADDLE_WITH_XPU)
     device_count = phi::backends::xpu::GetXPUDeviceCount();
 #endif
   }
-  if (platform::is_npu_place(place)) {
-#if defined(PADDLE_WITH_ASCEND_CL)
-    device_count = platform::GetNPUDeviceCount();
-#endif
-  }
   if (platform::is_ipu_place(place)) {
...
paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
@@ -631,16 +631,6 @@ void BuildOpFuncList(const platform::Place& place,
     VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope);
-
-#ifdef PADDLE_WITH_ASCEND_CL
-    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
-    // values, but only through special `float_status` to checks whether
-    // the operation is overflow. More about `float_status`, see:
-    // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
-    if (FLAGS_check_nan_inf) {
-      framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
-    }
-#endif
-
     try {
       if (dynamic_cast<framework::OperatorWithKernel*>(op) == nullptr) {
         VLOG(4) << "HandleOperatorBase";
...
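The NOTE above describes why NaN/Inf checking worked differently on NPU: outputs cannot be scanned directly, so a sticky device "float_status" flag is cleared before each op and inspected afterwards. A standalone sketch of that clear-run-check protocol; ClearFloatStatus/ReadFloatStatus are hypothetical stand-ins for the ACL routines behind NPUAllocAndClearFloatStatus, not real Paddle functions:

    #include <iostream>

    static bool g_float_status = false;  // toy model of the sticky device flag

    void ClearFloatStatus() { g_float_status = false; }
    void RunOp() { g_float_status = true; }  // pretend the op overflowed
    bool ReadFloatStatus() { return g_float_status; }

    int main() {
      ClearFloatStatus();         // step 1: reset the sticky flag before the op
      RunOp();                    // step 2: execute the operator on the device
      if (ReadFloatStatus()) {    // step 3: flag set => some NaN/Inf occurred
        std::cout << "overflow detected via float_status\n";
      }
      return 0;
    }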
paddle/fluid/framework/new_executor/interpretercore.cc
@@ -87,16 +87,6 @@ inline void SetDeviceId(const platform::Place& place) {
 #else
     auto dev_id = place.device;
     platform::SetXPUDeviceId(dev_id);
 #endif
-  } else if (platform::is_npu_place(place)) {
-#ifndef PADDLE_WITH_ASCEND_CL
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Cannot run operator on place %s, please recompile paddle or "
-        "reinstall Paddle with NPU support.",
-        place));
-#else
-    auto dev_id = place.device;
-    platform::SetNPUDeviceId(dev_id);
-#endif
   } else if (platform::is_custom_place(place)) {
 #ifndef PADDLE_WITH_CUSTOM_DEVICE
...
@@ -218,11 +208,6 @@ void InterpreterCore::RunImpl() {
     async_work_queue_ = GetWorkQueue();
     ExecuteInstructionList(vec_instruction_);
   }
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (platform::is_npu_place(place_)) {
-    platform::DeviceContextPool::Instance().Get(place_)->Wait();
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   if (platform::is_custom_place(place_)) {
     platform::DeviceContextPool::Instance().Get(place_)->Wait();
...
@@ -893,18 +878,6 @@ void InterpreterCore::RunOperator(const Instruction& instr_node) {
                          : var_scope_.GetMutableScope();
   VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope);
-
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (platform::is_npu_place(place)) {
-    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the
-    // variable values, but only through special `float_status` to checks
-    // whether the operation is overflow. More about `float_status`, see:
-    // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
-    if (FLAGS_check_nan_inf) {
-      framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
-    }
-  }
-#endif
-
   auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
   {
     // If it is OperatorBase, InferShape do nothing.
...
paddle/fluid/framework/operator.cc
@@ -770,16 +770,6 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #else
       auto dev_id = place.device;
       platform::SetXPUDeviceId(dev_id);
 #endif
-    } else if (platform::is_npu_place(place)) {
-#ifndef PADDLE_WITH_ASCEND_CL
-      PADDLE_THROW(platform::errors::Unavailable(
-          "Cannot run operator on place %s, please recompile paddle or "
-          "reinstall Paddle with NPU support.",
-          place));
-#else
-      auto dev_id = place.device;
-      platform::SetNPUDeviceId(dev_id);
-#endif
     } else if (platform::is_mlu_place(place)) {
 #ifndef PADDLE_WITH_MLU
...
@@ -1692,17 +1682,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   bool fallback_to_cpu = false;
   auto* dev_ctx = pool.Get(place);
-
-#ifdef PADDLE_WITH_ASCEND_CL
-  // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
-  // values, but only through special `float_status` to checks whether
-  // the operation is overflow. More about `float_status`, see:
-  // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
-  if (FLAGS_check_nan_inf) {
-    framework::details::NPUAllocAndClearFloatStatus(*this, scope, place);
-  }
-#endif
-
   // using cache
   if (kernel_type_.get()) {
     dev_ctx = pool.Get(kernel_type_->place_);
...
paddle/fluid/framework/parallel_executor.cc
@@ -553,20 +553,6 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
       PADDLE_THROW(platform::errors::PermissionDenied(
           "Paddle can't use IPU device since it's not compiled with IPU,"
           "Please recompile or reinstall Paddle with IPU support."));
 #endif
-    } else if (platform::is_npu_place(place)) {
-#if defined(PADDLE_WITH_ASCEND_CL)
-      if (IsFastEagerDeletionModeEnabled()) {
-        gc.reset(new NPUUnsafeFastGarbageCollector(place, max_memory_size));
-      } else {
-        gc.reset(new NPUUnsafeFastGarbageCollector(place, max_memory_size));
-      }
-      VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
-#else
-      PADDLE_THROW(platform::errors::PermissionDenied(
-          "Paddle can't use NPU device since it's not compiled with "
-          "NPU,"
-          "Please recompile or reinstall Paddle with NPU support."));
-#endif
     } else if (platform::is_custom_place(place)) {
 #if defined(PADDLE_WITH_CUSTOM_DEVICE)
...
paddle/fluid/framework/phi_utils.cc
@@ -112,15 +112,6 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key,
     return phi::KernelKey(
         phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (kernel_key.backend() == phi::Backend::NPU) {
-    VLOG(3) << "phi missing NPU kernel: " << op.Type()
-            << ", expected_kernel_key:" << kernel_key
-            << ", fallback to CPU one!";
-    return phi::KernelKey(
-        phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
-  }
-#endif
 #ifdef PADDLE_WITH_MLU
   if (kernel_key.backend() == phi::Backend::MLU) {
     VLOG(3) << "phi missing MLU kernel: " << op.Type()
...
paddle/fluid/framework/pipeline_trainer.cc
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/trainer.h"
...
@@ -37,8 +36,6 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
   int place_id = section_config.place_id();
 #if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_RCCL)
   place_ = platform::CUDAPlace(place_id);
-#elif (defined PADDLE_WITH_ASCEND_CL)  // NOLINT
-  place_ = platform::NPUPlace(place_id);
 #endif
   worker_ = DeviceWorkerFactory::CreateDeviceWorker(
       trainer_desc.device_worker_name());
...
paddle/fluid/framework/section_worker.cc
@@ -9,8 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include <float.h>
 #include "paddle/fluid/framework/device_worker.h"
...
@@ -235,18 +234,6 @@ void SectionWorker::TrainFiles() {
         gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size));
       }
     }
-#elif defined(PADDLE_WITH_ASCEND_CL)
-    if (IsFastEagerDeletionModeEnabled()) {
-      VLOG(4) << "Use unsafe fast gc for NPU.";
-      gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size));
-    } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Please set FLAGS_fast_eager_deletion_mode=true to use "
-          "GarbageCollector on NPU."));
-      // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
-      VLOG(4) << "Use default stream gc for NPU.";
-      gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size));
-    }
 #endif
   }  // max_memory_size >= 0
...
paddle/fluid/framework/tensor_test.cc
@@ -143,35 +143,6 @@ TEST(DenseTensor, MutableData) {
     EXPECT_EQ(p1, p2);
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  {
-    phi::DenseTensor src_tensor;
-    float* p1 = nullptr;
-    float* p2 = nullptr;
-    // initialization
-    p1 = src_tensor.mutable_data<float>(phi::make_ddim({1, 2, 3}),
-                                        platform::NPUPlace(0));
-    auto p1_holder = src_tensor.Holder();
-    EXPECT_NE(p1, nullptr);
-    // set src_tensor a new dim with large size
-    // momery is supposed to be re-allocated
-    p2 = src_tensor.mutable_data<float>(phi::make_ddim({3, 1024}),
-                                        platform::NPUPlace(0));
-    auto p2_holder = src_tensor.Holder();
-    EXPECT_NE(p2, nullptr);
-    EXPECT_NE(p1_holder.get(), p2_holder.get());
-    // set src_tensor a new dim with same size
-    // momery block is supposed to be unchanged
-    p1 = src_tensor.mutable_data<float>(phi::make_ddim({2, 2, 3}),
-                                        platform::NPUPlace(0));
-    EXPECT_EQ(p1, p2);
-    // set src_tensor a new dim with smaller size
-    // momery block is supposed to be unchanged
-    p2 = src_tensor.mutable_data<float>(phi::make_ddim({2, 2}),
-                                        platform::NPUPlace(0));
-    EXPECT_EQ(p1, p2);
-  }
-#endif
 }

 TEST(DenseTensor, ShareDataWith) {
...
@@ -207,16 +178,6 @@ TEST(DenseTensor, ShareDataWith) {
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  {
-    phi::DenseTensor src_tensor;
-    phi::DenseTensor dst_tensor;
-    src_tensor.mutable_data<int>(phi::make_ddim({2, 3, 4}),
-                                 platform::NPUPlace(0));
-    dst_tensor.ShareDataWith(src_tensor);
-    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
-  }
-#endif
 }

 TEST(DenseTensor, Slice) {
...
@@ -271,33 +232,6 @@ TEST(DenseTensor, Slice) {
     EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  {
-    phi::DenseTensor src_tensor;
-    src_tensor.mutable_data<double>(phi::make_ddim({6, 9}),
-                                    platform::NPUPlace(0));
-    phi::DenseTensor slice_tensor = src_tensor.Slice(2, 6);
-    phi::DDim slice_dims = slice_tensor.dims();
-    ASSERT_EQ(arity(slice_dims), 2);
-    EXPECT_EQ(slice_dims[0], 4);
-    EXPECT_EQ(slice_dims[1], 9);
-
-    uintptr_t src_data_address =
-        reinterpret_cast<uintptr_t>(src_tensor.data<double>());
-    uintptr_t src_mutable_data_address =
-        reinterpret_cast<uintptr_t>(src_tensor.mutable_data<double>(
-            src_tensor.dims(), platform::NPUPlace(0)));
-    uintptr_t slice_data_address =
-        reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
-    uintptr_t slice_mutable_data_address =
-        reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<double>(
-            slice_tensor.dims(), platform::NPUPlace(0)));
-    EXPECT_EQ(src_data_address, src_mutable_data_address);
-    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
-    EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
-  }
-#endif
 }

 TEST(DenseTensor, ReshapeToMatrix) {
...
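The removed Slice test asserts that slicing rows [2, 6) of a 6x9 tensor yields a view whose data pointer sits 2 * 9 elements past the source base. A standalone sketch of that row-major offset arithmetic, using a plain vector in place of phi::DenseTensor:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
      // Row-major 6x9 buffer of doubles; a slice of rows [2, 6) starts at
      // row 2, i.e. 2 * 9 elements past the base pointer.
      std::vector<double> buf(6 * 9);
      const double* base = buf.data();
      const double* slice = base + 2 * 9;
      assert(reinterpret_cast<uintptr_t>(base) + 9 * 2 * sizeof(double) ==
             reinterpret_cast<uintptr_t>(slice));
      return 0;
    }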
paddle/fluid/framework/tensor_util.cc
@@ -125,112 +125,6 @@ void TensorCopyImpl(const TENSOR& src,
         "Copy from %s to %s is not supported.", src_place, dst_place));
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  // TODO(zhiqiu): handle different condition like CUDA code below
-  else if (platform::is_npu_place(src_place) &&  // NOLINT
-           platform::is_cpu_place(dst_place)) {
-    auto stream =
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
-    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
-  }
-  else if (platform::is_cpu_place(src_place) &&  // NOLINT
-           platform::is_npu_place(dst_place)) {
-    //  1. cpu tensor -> npu pinned tensor
-    platform::NPUPinnedPlace npu_pinned_place;
-    phi::DenseTensor npu_pinned_tensor;
-    npu_pinned_tensor.Resize(src.dims());
-    auto npu_pinned_ptr =
-        npu_pinned_tensor.mutable_data(npu_pinned_place, src.dtype());
-    memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
-
-    //  2. async copy npu pinned tensor -> npu tensor
-    memory::Copy(
-        dst_place,
-        dst_ptr,
-        npu_pinned_place,
-        npu_pinned_ptr,
-        size,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
-
-    //  3. record event
-    auto npu_pinned_allocator =
-        static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
-            paddle::memory::allocation::AllocatorFacade::Instance()
-                .GetAllocator(npu_pinned_place)
-                .get());
-    phi::Allocation* allocation = npu_pinned_tensor.Holder().get();
-    npu_pinned_allocator->RecordEvent(
-        allocation,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
-  }
-  else if (platform::is_npu_place(src_place) &&  // NOLINT
-           platform::is_npu_place(dst_place)) {
-    if (src_ptr == dst_ptr) {
-      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
-              << dst_place;
-      return;
-    }
-    auto stream =
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
-    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
-  }
-  else if (platform::is_npu_pinned_place(src_place) &&  // NOLINT
-           platform::is_npu_place(dst_place)) { /* npu_pinned->npu */
-    auto src_npu_pinned_place = src_place;
-    auto dst_npu_place = dst_place;
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(
-        platform::is_npu_place(ctx_place),
-        true,
-        platform::errors::PreconditionNotMet(
-            "Device context place mismatch. When copying phi::DenseTensor "
-            "data from NPU Pinned memory to NPU memory, current "
-            "device context place should be NPU."));
-    auto ctx_npu_place = ctx_place;
-    PADDLE_ENFORCE_EQ(dst_npu_place,
-                      ctx_npu_place,
-                      platform::errors::PreconditionNotMet(
-                          "The target NPU device and current device context do "
-                          "not match. The target NPU device number is %d, but "
-                          "device context NPU number is %d.",
-                          dst_npu_place.device,
-                          ctx_npu_place.device));
-    auto stream =
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
-    memory::Copy(
-        dst_npu_place, dst_ptr, src_npu_pinned_place, src_ptr, size, stream);
-  }
-  else if (platform::is_npu_place(src_place) &&        // NOLINT
-           platform::is_npu_pinned_place(dst_place)) { /* npu->npu_pinned */
-    auto src_npu_place = src_place;
-    auto dst_npu_pinned_place = dst_place;
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(
-        platform::is_npu_place(ctx_place),
-        true,
-        platform::errors::PreconditionNotMet(
-            "Device context place mismatch. When copying phi::DenseTensor "
-            "data from NPU memory to NPU Pinned memory, current "
-            "device context place should be NPU."));
-    auto ctx_npu_place = ctx_place;
-    PADDLE_ENFORCE_EQ(src_place,
-                      ctx_npu_place,
-                      platform::errors::PreconditionNotMet(
-                          "The source NPU device and current device context do "
-                          "not match. The source NPU device number is %d, but "
-                          "device context NPU number is %d.",
-                          src_npu_place.device,
-                          ctx_npu_place.device));
-    auto stream =
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
-    memory::Copy(
-        dst_npu_pinned_place, dst_ptr, src_npu_place, src_ptr, size, stream);
-  }
-  else {  // NOLINT
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "Copy from %s to %s is not supported.", src_place, dst_place));
-  }
-#endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
          platform::is_cuda_pinned_place(dst_place)) {
...
@@ -539,29 +433,6 @@ void TensorCopySync(const phi::DenseTensor& src,
         "Copy from %s to %s is not supported.", src_place, dst_place));
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  else if (platform::is_npu_place(src_place) &&  // NOLINT
-           platform::is_cpu_place(dst_place)) { /* npu -> cpu*/
-    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
-  }
-  else if (platform::is_cpu_place(src_place) &&  // NOLINT
-           platform::is_npu_place(dst_place)) { /* cpu -> npu*/
-    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
-  }
-  else if (platform::is_npu_place(src_place) &&  // NOLINT
-           platform::is_npu_place(dst_place)) { /* npu -> npu*/
-    if (src_ptr == dst_ptr) {
-      VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
-              << dst_place;
-      return;
-    }
-    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
-  }
-  else {  // NOLINT
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "Copy from %s to %s is not supported.", src_place, dst_place));
-  }
-#endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
          platform::is_cuda_pinned_place(dst_place)) {
...
@@ -758,31 +629,6 @@ void TensorToStream(std::ostream& os,
 #else
       PADDLE_THROW(platform::errors::Unimplemented(
           "MLUPlace is not supported when not compiled with MLU"));
 #endif
-    } else if (platform::is_npu_place(tensor.place())) {
-#ifdef PADDLE_WITH_ASCEND_CL
-      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
-      std::unique_ptr<char[]> buf(new char[kBufSize]);
-      auto& npu_dev_ctx =
-          static_cast<const platform::NPUDeviceContext&>(dev_ctx);
-      platform::CPUPlace cpu;
-      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
-      while (size != 0) {
-        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
-        memory::Copy(cpu,
-                     buf.get(),
-                     tensor.place(),
-                     reinterpret_cast<const void*>(data),
-                     size_to_write,
-                     npu_dev_ctx.stream());
-        npu_dev_ctx.Wait();
-        os.write(buf.get(), size_to_write);
-        data += size_to_write;
-        size -= size_to_write;
-      }
-#else
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "NPUPlace is not supported when not compiled with NPU"));
-#endif
     } else if (platform::is_custom_place(tensor.place())) {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
...
@@ -875,7 +721,7 @@ void TensorFromStream(std::istream& is,
       platform::is_custom_place(dev_ctx.GetPlace())) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
     defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) ||  \
-    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE)
+    defined(PADDLE_WITH_CUSTOM_DEVICE)
     phi::DenseTensor cpu_tensor;
     cpu_tensor.Resize(phi::make_ddim(shape));
     framework::VisitDataType(
...
@@ -958,7 +804,7 @@ void TensorFromStream(std::istream& is,
       platform::is_custom_place(dev_ctx.GetPlace())) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
     defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) ||  \
-    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE)
+    defined(PADDLE_WITH_CUSTOM_DEVICE)
    phi::DenseTensor cpu_tensor;
    cpu_tensor.Resize(phi::make_ddim(dims));
    framework::VisitDataType(
...
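The removed TensorToStream branch serialized device memory through a fixed 64 MB staging buffer, syncing after each chunk before writing it out. A standalone sketch of that chunked loop; memcpy stands in for memory::Copy plus the stream wait, and the buffer is shrunk to 8 bytes so the loop visibly iterates:

    #include <algorithm>
    #include <cstring>
    #include <memory>
    #include <sstream>

    int main() {
      const char src[] = "serialize me through a tiny buffer";
      size_t size = sizeof(src);
      const char* data = src;

      constexpr size_t kBufSize = 8;  // real code: 1024 * 1024 * 64
      std::unique_ptr<char[]> buf(new char[kBufSize]);
      std::ostringstream os;
      while (size != 0) {
        size_t size_to_write = std::min(kBufSize, size);
        std::memcpy(buf.get(), data, size_to_write);  // device -> staging
        os.write(buf.get(), size_to_write);           // staging -> stream
        data += size_to_write;
        size -= size_to_write;
      }
      return os.str().size() == sizeof(src) ? 0 : 1;
    }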
paddle/fluid/framework/tensor_util.h
@@ -25,9 +25,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/string_array.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
-#ifdef PADDLE_WITH_ASCEND_CL
-#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
-#endif
 #include "paddle/fluid/platform/device_context.h"
 #ifdef PADDLE_WITH_MLU
 #include "paddle/fluid/platform/device/mlu/device_context.h"
...
@@ -145,37 +142,6 @@ void TensorFromArray(const T* src,
         reinterpret_cast<const phi::GPUContext&>(ctx).stream());
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  else if (platform::is_npu_place(dst_place)) {  // NOLINT
-    //  1. vector -> npu pinned tensor
-    platform::NPUPinnedPlace npu_pinned_place;
-    phi::DenseTensor npu_pinned_tensor;
-    npu_pinned_tensor.Resize(dst->dims());
-    auto npu_pinned_ptr =
-        npu_pinned_tensor.mutable_data(npu_pinned_place, dst->dtype());
-    memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
-
-    //  2. async copy npu pinned tensor -> npu tensor
-    memory::Copy(
-        dst_place,
-        dst_ptr,
-        npu_pinned_place,
-        npu_pinned_ptr,
-        size,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
-
-    //  3. record event
-    auto npu_pinned_allocator =
-        static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
-            paddle::memory::allocation::AllocatorFacade::Instance()
-                .GetAllocator(npu_pinned_place)
-                .get());
-    phi::Allocation* allocation = npu_pinned_tensor.Holder().get();
-    npu_pinned_allocator->RecordEvent(
-        allocation,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
-  }
-#endif
 #ifdef PADDLE_WITH_MLU
   else if (platform::is_mlu_place(dst_place)) {  // NOLINT
     memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
...
@@ -227,42 +193,6 @@ void TensorFromVector(const std::vector<T>& src,
         reinterpret_cast<const phi::GPUContext&>(ctx).stream());
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  // NOTE(zhiqiu): Becareful that aclrtMemcpyAsync is different from
-  // cudaMemcpyAsync.
-  // cudaMemcpyAsync is actually "sync" between cpu <-> gpu.
-  // aclrtMemcpyAsync is really "async" between cpu <-> npu.
-  // Since vector is on cpu, I think this function should be a "sync"
-  // operation, so pass nullptr as stream to memory::Copy().
-  else if (platform::is_npu_place(dst_place)) {  // NOLINT
-    //  1. vector -> npu pinned tensor
-    phi::DenseTensor npu_pinned_tensor(dst->dtype());
-    platform::NPUPinnedPlace npu_pinned_place;
-    auto npu_pinned_ptr =
-        npu_pinned_tensor.mutable_data<T>(dst->dims(), npu_pinned_place);
-    memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
-
-    //  2. async copy npu pinned tensor -> npu tensor
-    memory::Copy(
-        dst_place,
-        dst_ptr,
-        npu_pinned_place,
-        npu_pinned_ptr,
-        size,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
-
-    //  3. record event
-    auto npu_pinned_allocator =
-        static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
-            paddle::memory::allocation::AllocatorFacade::Instance()
-                .GetAllocator(npu_pinned_place)
-                .get());
-    phi::Allocation* allocation = npu_pinned_tensor.Holder().get();
-    npu_pinned_allocator->RecordEvent(
-        allocation,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
-  }
-#endif
 #ifdef PADDLE_WITH_MLU
   else if (platform::is_mlu_place(dst_place)) {  // NOLINT
     memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
...
@@ -324,37 +254,6 @@ inline void TensorFromVector(const std::vector<bool>& src,
         reinterpret_cast<const phi::GPUContext&>(ctx).stream());
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  else if (platform::is_npu_place(dst_place)) {  // NOLINT
-    //  1. vector -> npu pinned tensor
-    platform::NPUPinnedPlace npu_pinned_place;
-    phi::DenseTensor npu_pinned_tensor;
-    npu_pinned_tensor.Resize(dst->dims());
-    auto npu_pinned_ptr =
-        npu_pinned_tensor.mutable_data(npu_pinned_place, dst->dtype());
-    memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
-
-    //  2. async copy npu pinned tensor -> npu tensor
-    memory::Copy(
-        dst_place,
-        dst_ptr,
-        npu_pinned_place,
-        npu_pinned_ptr,
-        size,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
-
-    //  3. record event
-    auto npu_pinned_allocator =
-        static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
-            paddle::memory::allocation::AllocatorFacade::Instance()
-                .GetAllocator(npu_pinned_place)
-                .get());
-    phi::Allocation* allocation = npu_pinned_tensor.Holder().get();
-    npu_pinned_allocator->RecordEvent(
-        allocation,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (platform::is_custom_place(dst_place)) {  // NOLINT
     auto stream =
...
@@ -433,11 +332,6 @@ void TensorToVector(const phi::DenseTensor& src,
     memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  else if (platform::is_npu_place(src.place())) {  // NOLINT
-    memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
-  }
-#endif
 #ifdef PADDLE_WITH_MLU
   else if (platform::is_mlu_place(src.place())) {  // NOLINT
     memory::Copy(
...
@@ -491,11 +385,6 @@ inline void TensorToVector(const phi::DenseTensor& src,
     memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  else if (platform::is_npu_place(src.place())) {  // NOLINT
-    memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
-  }
-#endif
 #ifdef PADDLE_WITH_MLU
   else if (platform::is_mlu_place(src.place())) {  // NOLINT
     memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
...
@@ -566,11 +455,6 @@ inline T GetValue(const phi::DenseTensor* x) {
   if (!platform::is_cpu_place(x->place())) {
     phi::DenseTensor cpu_x;
     framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x);
-#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    const platform::DeviceContext* dev_ctx = pool.Get(x->place());
-    dev_ctx->Wait();
-#endif
    value = cpu_x.data<T>()[0];
  } else {
    value = x->data<T>()[0];
...
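The removed TensorFromVector/TensorFromArray branches all follow the same three-step staging pattern the inline comments number explicitly: host data goes into pinned memory, an async pinned-to-device copy is launched, and an event is recorded on the pinned allocation so it is not reused before the copy completes. A standalone sketch of that protocol; PinnedBuffer, async_copy, and record_event are illustrative stand-ins for NPUPinnedAllocator, memory::Copy, and RecordEvent:

    #include <vector>

    struct PinnedBuffer {
      std::vector<char> bytes;
      bool in_flight = false;  // real code: an event guards reuse
    };

    void async_copy(const PinnedBuffer& src, std::vector<char>& device_dst) {
      device_dst = src.bytes;  // pretend this was enqueued on a stream
    }
    void record_event(PinnedBuffer& buf) { buf.in_flight = true; }

    int main() {
      std::vector<char> host = {'a', 'b', 'c'};
      PinnedBuffer pinned;
      pinned.bytes.assign(host.begin(), host.end());  // 1. host -> pinned
      std::vector<char> device;
      async_copy(pinned, device);                     // 2. pinned -> device
      record_event(pinned);                           // 3. guard the buffer
      return (device.size() == 3 && pinned.in_flight) ? 0 : 1;
    }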
paddle/fluid/framework/tensor_util_test.cc
@@ -299,32 +299,6 @@ TEST(TensorToVector, Tensor_bool) {
     }
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  {
-    std::vector<bool> src_vec = {
-        false, true, false, true, false, true, false, true, false,
-    };
-    phi::DenseTensor npu_tensor;
-    paddle::platform::NPUPlace place(0);
-    paddle::platform::NPUDeviceContext npu_ctx(place);
-    paddle::framework::TensorFromVector<bool>(src_vec, npu_ctx, &npu_tensor);
-
-    std::vector<bool> dst;
-    paddle::framework::TensorToVector<bool>(npu_tensor, npu_ctx, &dst);
-
-    for (int i = 0; i < 3 * 3; ++i) {
-      EXPECT_EQ(src_vec[i], dst[i]);
-    }
-  }
-#endif
 }

 TEST(TensorFromDLPack, Tensor) {
...
paddle/fluid/framework/trainer.h
@@ -302,8 +302,7 @@ class PSGPUTrainer : public TrainerBase {
 };
 #endif

-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class PipelineTrainer : public TrainerBase {
  public:
  PipelineTrainer() {}
...
paddle/fluid/framework/trainer_factory.cc
@@ -82,8 +82,7 @@ REGISTER_TRAINER_CLASS(HeterXpuTrainer);
     (defined PADDLE_WITH_PSLIB)
 REGISTER_TRAINER_CLASS(PSGPUTrainer);
 #endif
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 REGISTER_TRAINER_CLASS(PipelineTrainer);
 #endif
 }  // namespace framework
...
paddle/fluid/framework/type_defs.h
@@ -65,28 +65,6 @@ using Attribute = paddle::variant<paddle::blank,
                                   std::vector<paddle::experimental::Scalar>>;
 using AttributeMap = std::unordered_map<std::string, Attribute>;
-
-#ifdef PADDLE_WITH_ASCEND_CL
-using NPUAttribute = paddle::variant<paddle::blank,
-                                     int,
-                                     float,
-                                     std::string,
-                                     std::vector<int>,
-                                     std::vector<float>,
-                                     std::vector<std::string>,
-                                     bool,
-                                     std::vector<bool>,
-                                     BlockDesc*,
-                                     int64_t,
-                                     std::vector<BlockDesc*>,
-                                     std::vector<int64_t>,
-                                     std::vector<double>,
-                                     VarDesc*,
-                                     std::vector<VarDesc*>,
-                                     std::vector<std::vector<int64_t>>>;
-
-using NPUAttributeMap = std::unordered_map<std::string, NPUAttribute>;
-#endif
 
 using OpCreator =
     std::function<OperatorBase*(const std::string& /*type*/,
                                 const VariableNameMap& /*inputs*/,
...
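Both Attribute and the removed NPUAttribute use the same design: one variant type that can hold every supported attribute kind, keyed by name in an unordered_map. A standalone sketch of the pattern, with std::variant standing in for paddle::variant and a deliberately small list of alternatives:

    #include <cassert>
    #include <string>
    #include <unordered_map>
    #include <variant>
    #include <vector>

    int main() {
      // One map value type covering several attribute kinds (C++17).
      using Attr = std::variant<int, float, std::string, std::vector<int>>;
      using AttrMap = std::unordered_map<std::string, Attr>;

      AttrMap attrs;
      attrs["axis"] = 1;                        // int attribute
      attrs["shape"] = std::vector<int>{2, 3};  // vector attribute

      assert(std::get<int>(attrs["axis"]) == 1);
      assert(std::get<std::vector<int>>(attrs["shape"]).size() == 2);
      return 0;
    }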
paddle/fluid/framework/var_type_traits.h
@@ -39,11 +39,6 @@
 #endif
 #endif

-#ifdef PADDLE_WITH_ASCEND_CL
-#include <hccl/hccl.h>
-#include <hccl/hccl_types.h>
-#endif
-
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "xpu/bkcl.h"
 #endif
...
@@ -69,10 +64,6 @@ class Communicator;
 class NCCLCommunicator;
 #endif
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-class Communicator;
-class HCCLCommunicator;
-#endif

 #if defined(PADDLE_WITH_XPU_BKCL)
 class BKCLCommunicator;
...
@@ -205,9 +196,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
 #endif
     operators::CudnnRNNCache,
 #endif
-#if defined(PADDLE_WITH_ASCEND_CL)
-    HcclRootInfo,
-#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
     BKCLUniqueId,
     platform::BKCLCommunicator,
...
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -36,49 +36,6 @@ namespace paddle {
 namespace inference {
 namespace analysis {

-#ifdef PADDLE_WITH_ASCEND_CL
-void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) {
-  if (!argument->use_npu()) return;
-
-  auto &graph = argument->main_graph();
-  std::vector<std::string> repetitive_params;
-
-  if (graph.Has(framework::ir::kRepetitiveParamAttr))
-    repetitive_params = graph.Get<std::vector<std::string>>(
-        framework::ir::kRepetitiveParamAttr);
-
-  LOG(INFO) << "Sync params from CPU to NPU";
-
-  PADDLE_ENFORCE_EQ(argument->npu_device_id_valid(),
-                    true,
-                    platform::errors::PreconditionNotMet(
-                        "The npu_device_id field should be valid"));
-  platform::Place place = platform::NPUPlace(argument->npu_device_id());
-  auto *scope = argument->scope_ptr();
-  std::vector<std::string> all_vars = scope->LocalVarNames();
-
-  for (auto &var_name : all_vars) {
-    auto *var = scope->FindLocalVar(var_name);
-    PADDLE_ENFORCE_NOT_NULL(var,
-                            platform::errors::PreconditionNotMet(
-                                "The var should not be nullptr"));
-
-    if (var->IsType<phi::DenseTensor>()) {
-      auto *t = var->GetMutable<phi::DenseTensor>();
-
-      platform::CPUPlace cpu_place;
-      phi::DenseTensor temp_tensor;
-      temp_tensor.Resize(t->dims());
-      temp_tensor.mutable_data<float>(cpu_place);
-
-      paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
-      t->clear();
-      paddle::framework::TensorCopySync(temp_tensor, place, t);
-    }
-  }
-}
-#endif
-
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
   // The parameters are on the cpu, therefore, synchronization is not necessary.
...
@@ -253,11 +210,6 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   PADDLE_ENFORCE_EQ(argument->scope_valid(),
                     true,
                     platform::errors::PreconditionNotMet(
                         "The scope field should be valid"));
-
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (argument->use_npu_valid()) {
-    CopyParamsToNpu(argument);
-  }
-#endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (argument->use_gpu_valid()) {
     CopyParamsToGpu(argument);
...
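CopyParamsToNpu performs a round trip per parameter: stage the tensor into a CPU-side temporary, clear the original holder, then copy the staged data back to the target device. A standalone sketch of that sequence, with plain vectors standing in for phi::DenseTensor and the two TensorCopySync calls:

    #include <cassert>
    #include <vector>

    int main() {
      std::vector<float> param = {0.1f, 0.2f, 0.3f};  // "device" parameter

      std::vector<float> temp(param.begin(), param.end());  // device -> cpu
      param.clear();                             // release the old buffer
      param.assign(temp.begin(), temp.end());    // cpu -> fresh device buffer

      assert(param.size() == 3);
      return 0;
    }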
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
@@ -35,10 +35,6 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
   std::string repr() const override;

  private:
-#ifdef PADDLE_WITH_ASCEND_CL
-  void CopyParamsToNpu(Argument *argument);
-#endif
-
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void CopyParamsToGpu(Argument *argument);
 #endif
...
paddle/fluid/inference/api/analysis_config.cc
@@ -195,21 +195,6 @@ void AnalysisConfig::SetXpuDeviceId(int device_id) {
   Update();
 }

-void AnalysisConfig::EnableNpu(int device_id) {
-#if defined(PADDLE_WITH_ASCEND_CL)
-  use_npu_ = true;
-  npu_device_id_ = device_id;
-#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
-  use_custom_device_ = true;
-  custom_device_id_ = device_id;
-  custom_device_type_ = "npu";
-#else
-  LOG(ERROR) << "Please compile with npu to EnableNpu()";
-  use_npu_ = false;
-#endif
-
-  Update();
-}
-
 void AnalysisConfig::EnableCustomDevice(const std::string& device_type,
                                         int device_id,
                                         Precision precision_mode) {
...
@@ -1023,20 +1008,6 @@ void AnalysisConfig::Update() {
           "with XPU-runtime."));
 #endif
   }
-
-  if (use_npu_) {
-#if defined(PADDLE_WITH_ASCEND_CL) || defined(LITE_SUBGRAPH_WITH_NPU)
-    PADDLE_ENFORCE_EQ(use_gpu_,
-                      false,
-                      platform::errors::Unavailable(
-                          "Currently, NPU and GPU cannot be enabled in the "
-                          "same analysis configuration."));
-#else
-    PADDLE_THROW(platform::errors::Unavailable(
-        "You tried to use an NPU device, but Paddle was not compiled "
-        "with NPU-runtime."));
-#endif
-  }
   if (use_ipu_) {
 #ifndef PADDLE_WITH_IPU
     PADDLE_THROW(platform::errors::Unavailable(
...
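For context, this is how a client would have switched inference onto the NPU before this change, per the EnableNpu declaration removed from paddle_analysis_config.h below. The model path is illustrative, and the snippet no longer compiles against Paddle after this commit:

    #include "paddle/fluid/inference/api/paddle_analysis_config.h"

    int main() {
      paddle::AnalysisConfig config;
      config.SetModel("/path/to/model");   // illustrative model path
      config.EnableNpu(/*device_id=*/0);   // entry point removed by this commit
      return config.use_npu() ? 0 : 1;
    }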
paddle/fluid/inference/api/analysis_predictor.cc
@@ -376,14 +376,6 @@ void AnalysisPredictor::InitPlace() {
           "with WITH_XPU."));
 #endif  // PADDLE_WITH_XPU
     }
-  } else if (config_.use_npu()) {
-#ifdef PADDLE_WITH_ASCEND_CL
-    place_ = paddle::platform::NPUPlace(config_.npu_device_id());
-#else
-    PADDLE_THROW(platform::errors::Unavailable(
-        "You tried to use NPU forward propagation, but Paddle was not compiled "
-        "with WITH_ASCEND_CL."));
-#endif
   } else if (config_.NNAdapter().use_nnadapter) {
     if (config_.lite_engine_enabled()) {
       place_ = paddle::platform::CPUPlace();
...
paddle/fluid/inference/api/api_impl.cc
@@ -278,23 +278,6 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
 #else
       PADDLE_THROW(platform::errors::Unavailable(
           "Not compile with XPU, should not reach here."));
 #endif
-    } else {
-#ifdef PADDLE_WITH_ASCEND_CL
-      platform::DeviceContextPool &pool =
-          platform::DeviceContextPool::Instance();
-      auto *dev_ctx =
-          static_cast<const platform::NPUDeviceContext *>(pool.Get(place_));
-      auto dst_npu_place = place_;
-      memory::Copy(dst_npu_place,
-                   static_cast<void *>(input_ptr),
-                   platform::CPUPlace(),
-                   inputs[i].data.data(),
-                   inputs[i].data.length(),
-                   dev_ctx->stream());
-#else
-      PADDLE_THROW(platform::errors::Unavailable(
-          "Not compile with NPU, should not reach here."));
-#endif
     }
...
paddle/fluid/inference/api/api_impl_tester.cc
@@ -305,15 +305,6 @@ TEST(inference_api_native, image_classification_xpu) {
 }
 #endif

-#ifdef PADDLE_WITH_ASCEND_CL
-TEST(inference_api_native, word2vec_npu) {
-  MainWord2Vec(paddle::PaddlePlace::kNPU);
-}
-// TEST(inference_api_native, image_classification_npu) {
-//   MainImageClassification(paddle::PaddlePlace::kNPU);
-// }
-#endif
-
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(inference_api_native, word2vec_gpu) {
   MainWord2Vec(paddle::PaddlePlace::kGPU);
...
paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -244,25 +244,6 @@ void Tensor::CopyFromCpu(const T *data) {
     PADDLE_THROW(paddle::platform::errors::Unavailable(
         "Can not create tensor with XPU place because paddle is not compiled "
         "with XPU."));
 #endif
-  } else if (place_ == PlaceType::kNPU) {
-#ifdef PADDLE_WITH_ASCEND_CL
-    paddle::platform::DeviceContextPool &pool =
-        paddle::platform::DeviceContextPool::Instance();
-    paddle::platform::NPUPlace npu_place(device_);
-    auto *t_data = tensor->mutable_data<T>(npu_place);
-    auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
-        pool.Get(npu_place));
-    paddle::memory::Copy(npu_place,
-                         static_cast<void *>(t_data),
-                         paddle::platform::CPUPlace(),
-                         data,
-                         ele_size,
-                         dev_ctx->stream());
-#else
-    PADDLE_THROW(paddle::platform::errors::Unavailable(
-        "Can not create tensor with NPU place because paddle is not compiled "
-        "with NPU."));
-#endif
   } else {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
...
@@ -468,25 +449,6 @@ void Tensor::CopyToCpuImpl(T *data,
     PADDLE_THROW(paddle::platform::errors::Unavailable(
         "Can not create tensor with XPU place because paddle is not compiled "
         "with XPU."));
 #endif
-  } else if (place_ == PlaceType::kNPU) {
-#ifdef PADDLE_WITH_ASCEND_CL
-    paddle::platform::DeviceContextPool &pool =
-        paddle::platform::DeviceContextPool::Instance();
-    auto npu_place = t_place;
-    auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
-        pool.Get(npu_place));
-    paddle::memory::Copy(paddle::platform::CPUPlace(),
-                         static_cast<void *>(data),
-                         npu_place,
-                         t_data,
-                         ele_num * sizeof(T),
-                         dev_ctx->stream());
-    paddle::platform::NPUStreamSync(dev_ctx->stream());
-#else
-    PADDLE_THROW(paddle::platform::errors::Unavailable(
-        "Can not create tensor with NPU place because paddle is not compiled "
-        "with NPU."));
-#endif
   } else {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
...
paddle/fluid/inference/api/details/zero_copy_tensor_test.cc
@@ -146,10 +146,6 @@ TEST(Tensor, FillRandomDataAndCheck) {
   ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kGPU));
   ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kGPU));
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kNPU));
-  ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kNPU));
-#endif
 #ifdef PADDLE_WITH_XPU
   ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kXPU));
   ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kXPU));
...
paddle/fluid/inference/api/paddle_analysis_config.h
@@ -363,12 +363,6 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   void SetXpuDeviceId(int device_id = 0);
   ///
-  /// \brief Turn on NPU.
-  ///
-  /// \param device_id device_id the NPU card to use (default is 0).
-  ///
-  void EnableNpu(int device_id = 0);
-  ///
   /// \brief Turn on CustomDevice.
   ///
   /// \param device_type device_type the custom device to use.
...
paddle/fluid/inference/capi_exp/pd_config.cc
@@ -171,11 +171,6 @@ void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config,
                         enable_multi_stream);
 }

-void PD_ConfigEnableNpu(__pd_keep PD_Config* pd_config, int32_t device_id) {
-  CHECK_AND_CONVERT_PD_CONFIG;
-  config->EnableNpu(device_id);
-}
-
 PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) {
   CHECK_AND_CONVERT_PD_CONFIG;
   return config->use_xpu();
...
paddle/fluid/inference/capi_exp/pd_config.h
@@ -214,14 +214,6 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu(
     PD_Bool adaptive_seqlen,
     PD_Bool enable_multi_stream);
 ///
-/// \brief Turn on NPU.
-///
-/// \param[in] pd_onfig config
-/// \param[in] device_id device_id the NPU card to use.
-///
-PADDLE_CAPI_EXPORT extern void PD_ConfigEnableNpu(
-    __pd_keep PD_Config* pd_config, int32_t device_id);
-///
 /// \brief A boolean state telling whether the XPU is turned on.
 ///
 /// \param[in] pd_onfig config
...
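The Go binding below is a thin wrapper over this C entry point. For reference, a minimal C++ sketch of the C-API call path that is being removed; the include path is assumed (the header lives under inference/capi_exp), and PD_ConfigEnableNpu itself no longer exists after this commit:

    #include "pd_config.h"  // assumed include path for the inference C API

    int main() {
      PD_Config* config = PD_ConfigCreate();
      PD_ConfigEnableNpu(config, /*device_id=*/0);  // removed by this commit
      PD_ConfigDestroy(config);
      return 0;
    }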
paddle/fluid/inference/goapi/config.go
@@ -212,15 +212,6 @@ func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune boo
 		cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen), cvtGoBoolToPD(enableMultiStream))
 }

-///
-/// \brief Turn on NPU.
-///
-/// \param deviceId the NPU card to use.
-///
-func (config *Config) EnableNpu(deviceId int32) {
-	C.PD_ConfigEnableNpu(config.c, C.int32_t(deviceId))
-}
-
 ///
 /// \brief A boolean state telling whether the GPU is turned on.
 ///
...
浏览文件 @
0b60f28c
...
@@ -50,11 +50,6 @@ if(UNIX AND NOT APPLE)
...
@@ -50,11 +50,6 @@ if(UNIX AND NOT APPLE)
list
(
APPEND ALLOCATOR_DEPS rt
)
list
(
APPEND ALLOCATOR_DEPS rt
)
endif
()
endif
()
if
(
WITH_ASCEND_CL
)
list
(
APPEND ALLOCATOR_SRCS npu_allocator.cc npu_pinned_allocator.cc
)
list
(
APPEND ALLOCATOR_DEPS npu_info
)
endif
()
if
(
WITH_CUSTOM_DEVICE
)
if
(
WITH_CUSTOM_DEVICE
)
list
(
APPEND ALLOCATOR_SRCS custom_allocator.cc
)
list
(
APPEND ALLOCATOR_SRCS custom_allocator.cc
)
endif
()
endif
()
...
...
paddle/fluid/memory/allocation/allocator_facade.cc
...
@@ -54,10 +54,6 @@
...
@@ -54,10 +54,6 @@
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#ifdef PADDLE_WITH_IPU
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#endif
...
@@ -198,12 +194,6 @@ class AllocatorFacadePrivate {
...
@@ -198,12 +194,6 @@ class AllocatorFacadePrivate {
InitNaiveBestFitXPUAllocator
(
platform
::
XPUPlace
(
dev_id
));
InitNaiveBestFitXPUAllocator
(
platform
::
XPUPlace
(
dev_id
));
}
}
#endif
#endif
#ifdef PADDLE_WITH_ASCEND_CL
for
(
int
dev_id
=
0
;
dev_id
<
platform
::
GetNPUDeviceCount
();
++
dev_id
)
{
InitNaiveBestFitNPUAllocator
(
platform
::
NPUPlace
(
dev_id
));
}
InitNaiveBestFitNPUPinnedAllocator
();
#endif
#ifdef PADDLE_WITH_MLU
#ifdef PADDLE_WITH_MLU
for
(
int
dev_id
=
0
;
dev_id
<
platform
::
GetMLUDeviceCount
();
++
dev_id
)
{
for
(
int
dev_id
=
0
;
dev_id
<
platform
::
GetMLUDeviceCount
();
++
dev_id
)
{
InitNaiveBestFitMLUAllocator
(
platform
::
MLUPlace
(
dev_id
));
InitNaiveBestFitMLUAllocator
(
platform
::
MLUPlace
(
dev_id
));
...
@@ -254,12 +244,6 @@ class AllocatorFacadePrivate {
       InitNaiveBestFitCUDAPinnedAllocator();
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-      for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
-        InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
-      }
-      InitNaiveBestFitNPUPinnedAllocator();
-#endif
 #ifdef PADDLE_WITH_XPU
       for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
         InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
...
@@ -823,17 +807,6 @@ class AllocatorFacadePrivate {
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
-    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
-  }
-
-  void InitNaiveBestFitNPUPinnedAllocator() {
-    allocators_[platform::NPUPinnedPlace()] =
-        std::make_shared<paddle::memory::allocation::NPUPinnedAllocator>();
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) {
     allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
...
@@ -915,12 +888,6 @@ class AllocatorFacadePrivate {
       places.emplace_back(platform::XPUPlace(dev_id));
     }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-    int device_count = platform::GetNPUDeviceCount();
-    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
-      places.emplace_back(platform::NPUPlace(dev_id));
-    }
-#endif
 #ifdef PADDLE_WITH_IPU
     int device_count = platform::GetIPUDeviceCount();
     for (int dev_id = 0; dev_id < device_count; ++dev_id) {
...
@@ -1107,7 +1074,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
   } else {
     return m->GetAllocator(p, size)->Allocate(size);
   }
-#elif defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL)
+#elif defined(PADDLE_WITH_XPU)
   return GetAllocator(place)->Allocate(size);
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet(
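For orientation, the branches above all funnel through the facade's single allocation entry point. A rough sketch of the calling pattern, assuming the usual AllocatorFacade::Instance() singleton accessor; the place and size are arbitrary examples:

// Returns a smart pointer whose release routes back to the owning allocator.
auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
auto buffer = facade.Alloc(platform::CPUPlace(), 4096);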
...

paddle/fluid/memory/allocation/allocator_facade.h
@@ -16,9 +16,6 @@
 #include <memory>
 
 #include "paddle/fluid/memory/allocation/allocator.h"
-#ifdef PADDLE_WITH_ASCEND_CL
-#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
-#endif
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #endif
...
@@ -29,10 +26,6 @@ namespace paddle {
 namespace memory {
 namespace allocation {
-#ifdef PADDLE_WITH_ASCEND_CL
-using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator;
-#endif
 // Allocator Facade is the interface exposed to other modules.
 // All the configuration or dirty code under development should
 // be hidden behind this facade.
...

paddle/fluid/memory/allocation/buddy_allocator.cc
@@ -19,8 +19,7 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "glog/logging.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
defined(PADDLE_WITH_MLU) || defined(PADDLE_WITH_ASCEND_CL)
#define USE_DEVICE
#define USE_DEVICE
DECLARE_uint64
(
reallocate_gpu_memory_in_mb
);
DECLARE_uint64
(
reallocate_gpu_memory_in_mb
);
#endif
#endif
...
@@ -57,9 +56,6 @@ BuddyAllocator::BuddyAllocator(
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   init_allocate_size_func_ = &platform::GpuInitAllocSize;
   re_allocate_size_func_ = &platform::GpuReallocSize;
-#elif defined(PADDLE_WITH_ASCEND_CL)
-  init_allocate_size_func_ = &platform::NPUInitAllocSize;
-  re_allocate_size_func_ = &platform::NPUReallocSize;
 #elif defined(PADDLE_WITH_MLU)
   init_allocate_size_func_ = &platform::MLUInitAllocSize;
   re_allocate_size_func_ = &platform::MLUReallocSize;
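The #if chain above binds per-device pool-growth policies as plain function pointers at compile time. A condensed generic sketch of the same pattern, with illustrative names only:

#include <cstddef>

static size_t FixedInitSize() { return 1 << 20; }      // first pool chunk: 1 MB
static size_t LargerReallocSize() { return 1 << 22; }  // later refills: 4 MB

struct GrowthPolicy {
  size_t (*init_size)();     // size of the first pool chunk
  size_t (*realloc_size)();  // size of each later refill
};

// Bound once per build flavor, exactly as the #if/#elif ladder does above.
static const GrowthPolicy kPolicy = {&FixedInitSize, &LargerReallocSize};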
...
@@ -257,9 +253,6 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   allocate_bytes = DeviceAllocateSize(
       &platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes);
-#elif defined(PADDLE_WITH_ASCEND_CL)
-  allocate_bytes = DeviceAllocateSize(
-      &platform::NPUInitAllocSize, &platform::NPUReallocSize, request_bytes);
 #elif defined(PADDLE_WITH_MLU)
   allocate_bytes = DeviceAllocateSize(
       &platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes);
...

paddle/fluid/memory/allocation/buddy_allocator_test.cc
@@ -29,8 +29,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
DECLARE_double
(
fraction_of_gpu_memory_to_use
);
DECLARE_double
(
fraction_of_gpu_memory_to_use
);
DECLARE_uint64
(
initial_gpu_memory_in_mb
);
DECLARE_uint64
(
initial_gpu_memory_in_mb
);
DECLARE_uint64
(
reallocate_gpu_memory_in_mb
);
DECLARE_uint64
(
reallocate_gpu_memory_in_mb
);
...
@@ -396,34 +395,6 @@ TEST(BuddyAllocator, Release) {
 }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-TEST(BuddyAllocator, NpuFraction) {
-  // In a 16 GB machine, the pool size will be about 160 MB
-  FLAGS_fraction_of_gpu_memory_to_use = 0.92;
-  FLAGS_initial_gpu_memory_in_mb = 0;
-  FLAGS_reallocate_gpu_memory_in_mb = 0;
-
-  BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new NPUAllocator(0)),
-      platform::NPUMinChunkSize(),
-      platform::NPUMaxChunkSize());
-
-  // Less than pool size
-  TestBuddyAllocator(&buddy_allocator, 10);
-  TestBuddyAllocator(&buddy_allocator, 10 << 10);
-  TestBuddyAllocator(&buddy_allocator, 10 << 20);
-  buddy_allocator.Release();
-
-  // Greater than max chunk size
-  TestBuddyAllocator(&buddy_allocator, 300 << 20,
-                     /* use_system_allocator = */ true);
-  TestBuddyAllocator(&buddy_allocator,
-                     1 * static_cast<size_t>(1 << 30),
-                     /* use_system_allocator = */ true);
-}
-#endif
 #ifdef PADDLE_WITH_MLU
 TEST(BuddyAllocator, MluFraction) {
   // In a 16 GB machine, the pool size will be about 160 MB
...

paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -213,210 +213,6 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
 #endif
 }
 
-// For Ascend NPU
-#ifdef PADDLE_WITH_ASCEND_CL
-constexpr int EXTRA_PADDING_SIZE = 32;
-class NPUBuddyAllocatorList {
- private:
-  NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) {
-    auto npu_num = devices_.size();
-    allocators_.resize(npu_num);
-    init_flags_.reserve(npu_num);
-    for (size_t i = 0; i < npu_num; ++i) {
-      init_flags_.emplace_back(new std::once_flag());
-    }
-  }
-
-  static NPUBuddyAllocatorList *CreateNewInstance() {
-    return new NPUBuddyAllocatorList();
-  }
-
- public:
-  static NPUBuddyAllocatorList *Instance() {
-    static auto *instance = CreateNewInstance();
-    return instance;
-  }
-
-  BuddyAllocator *Get(int npu_id) {
-    auto pos = std::distance(
-        devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
-    PADDLE_ENFORCE_LT(pos,
-                      devices_.size(),
-                      platform::errors::OutOfRange(
-                          "The index exceeds the size of devices, the size of "
-                          "devices is %d, the index is %d",
-                          devices_.size(),
-                          pos));
-
-    std::call_once(*init_flags_[pos], [this, pos] {
-      platform::SetNPUDeviceId(devices_[pos]);
-      allocators_[pos].reset(
-          new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
-                                 new detail::NPUAllocator(devices_[pos])),
-                             platform::NPUMinChunkSize(),
-                             platform::NPUMaxChunkSize(),
-                             EXTRA_PADDING_SIZE));
-      VLOG(10) << "\n\nNOTE:\n"
-               << "You can set GFlags environment variable "
-               << "'FLAGS_fraction_of_gpu_memory_to_use' "
-               << "or 'FLAGS_initial_gpu_memory_in_mb' "
-               << "or 'FLAGS_reallocate_gpu_memory_in_mb' "
-               << "to change the memory size for GPU usage.\n"
-               << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
-               << FLAGS_fraction_of_gpu_memory_to_use
-               << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
-               << FLAGS_initial_gpu_memory_in_mb
-               << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
-               << FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
-    });
-
-    return allocators_[pos].get();
-  }
-
- private:
-  std::vector<int> devices_;
-  std::vector<std::unique_ptr<std::once_flag>> init_flags_;
-  std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
-};
-
-BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
-  return NPUBuddyAllocatorList::Instance()->Get(npu_id);
-}
-
-BuddyAllocator *GetNPUPinnedBuddyAllocator() {
-  static std::once_flag init_flag;
-  static BuddyAllocator *ba = nullptr;
-
-  std::call_once(init_flag, []() {
-    ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
-                                new detail::NPUPinnedAllocator),
-                            phi::backends::cpu::NPUPinnedMinChunkSize(),
-                            phi::backends::cpu::NPUPinnedMaxChunkSize());
-  });
-
-  return ba;
-}
-#endif
-
-template <>
-size_t Used<platform::NPUPlace>(const platform::NPUPlace &place) {
-#ifdef PADDLE_WITH_ASCEND_CL
-  return GetNPUBuddyAllocator(place.device)->Used();
-#else
-  PADDLE_THROW(platform::errors::PermissionDenied(
-      "'NPUPlace' is not supported in CPU only device."));
-#endif
-}
-
-template <>
-void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
-#ifdef PADDLE_WITH_ASCEND_CL
-  auto *buddy_allocator = GetNPUBuddyAllocator(place.device);
-  auto *ptr = buddy_allocator->Alloc(size);
-  if (ptr == nullptr) {
-    platform::NPUDeviceGuard(place.device);
-    size_t avail, total;
-    platform::NPUMemoryUsage(&avail, &total);
-    PADDLE_THROW(platform::errors::ResourceExhausted(
-        "Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize "
-        "%s, NpuMaxChunkSize %s, NPU memory used: %s.",
-        string::HumanReadableSize(size),
-        place.device,
-        string::HumanReadableSize(avail),
-        string::HumanReadableSize(total),
-        string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
-        string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
-        string::HumanReadableSize(Used<platform::NPUPlace>(place))));
-  } else {
-    if (FLAGS_init_allocated_mem) {
-      platform::NPUMemsetSync(ptr, 0xEF, size, size);
-    }
-  }
-  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
-  return ptr;
-#else
-  PADDLE_THROW(platform::errors::PermissionDenied(
-      "'NPUPlace' is not supported in CPU only device."));
-#endif
-}
-
-template <>
-void Free<platform::NPUPlace>(const platform::NPUPlace &place,
-                              void *p,
-                              size_t size) {
-#ifdef PADDLE_WITH_ASCEND_CL
-  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
-  GetNPUBuddyAllocator(place.device)->Free(p);
-#else
-  PADDLE_THROW(platform::errors::PermissionDenied(
-      "'NPUPlace' is not supported in CPU only device."));
-#endif
-}
-
-template <>
-uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
-#ifdef PADDLE_WITH_ASCEND_CL
-  return GetNPUBuddyAllocator(place.device)->Release();
-#else
-  PADDLE_THROW(platform::errors::PermissionDenied(
-      "'NPUPlace' is not supported in CPU only device."));
-#endif
-}
-
-template <>
-size_t Used<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place) {
-#ifdef PADDLE_WITH_ASCEND_CL
-  return GetNPUPinnedBuddyAllocator()->Used();
-#else
-  PADDLE_THROW(platform::errors::PermissionDenied(
-      "'NPUPinnedPlace' is not supported in CPU only device."));
-#endif
-}
-
-template <>
-void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
-                                      size_t size) {
-#ifdef PADDLE_WITH_ASCEND_CL
-  auto *buddy_allocator = GetNPUPinnedBuddyAllocator();
-  void *ptr = buddy_allocator->Alloc(size);
-
-  if (ptr == nullptr) {
-    LOG(WARNING) << "Cannot allocate " << size << " bytes in NPUPinnedPlace";
-  }
-  if (FLAGS_init_allocated_mem) {
-    memset(ptr, 0xEF, size);
-  }
-  return ptr;
-#else
-  PADDLE_THROW(platform::errors::PermissionDenied(
-      "'NPUPinnedPlace' is not supported in CPU only device."));
-#endif
-}
-
-template <>
-void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
-                                    void *p,
-                                    size_t size) {
-#ifdef PADDLE_WITH_ASCEND_CL
-  GetNPUPinnedBuddyAllocator()->Free(p);
-#else
-  PADDLE_THROW(platform::errors::PermissionDenied(
-      "'NPUPinnedPlace' is not supported in CPU only device."));
-#endif
-}
-
-template <>
-uint64_t Release<platform::NPUPinnedPlace>(
-    const platform::NPUPinnedPlace &place) {
-#ifdef PADDLE_WITH_ASCEND_CL
-  return GetNPUPinnedBuddyAllocator()->Release();
-#else
-  PADDLE_THROW(platform::errors::PermissionDenied(
-      "'NPUPinnedPlace' is not supported in CPU only device."));
-#endif
-}
-
 // For CUDA
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 class GPUBuddyAllocatorList {
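The deleted NPUBuddyAllocatorList follows the same lazily-initialized, one-allocator-per-device scheme as the GPU list kept below it. A condensed sketch of that scheme; all names are illustrative:

#include <memory>
#include <mutex>
#include <vector>

struct DeviceAllocator {
  explicit DeviceAllocator(int device_id) { /* bind device, build pool */ }
};

class PerDeviceAllocatorList {
 public:
  explicit PerDeviceAllocatorList(size_t n) : allocators_(n) {
    for (size_t i = 0; i < n; ++i) flags_.emplace_back(new std::once_flag());
  }

  // Thread-safe lazy construction: each slot is built at most once.
  DeviceAllocator* Get(size_t i) {
    std::call_once(*flags_[i], [this, i] {
      allocators_[i].reset(new DeviceAllocator(static_cast<int>(i)));
    });
    return allocators_[i].get();
  }

 private:
  std::vector<std::unique_ptr<DeviceAllocator>> allocators_;
  std::vector<std::unique_ptr<std::once_flag>> flags_;
};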
...

paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc
@@ -61,22 +61,6 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) {
 }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-TEST(NaiveBestFitAllocatorTest, NpuAlloc) {
-  NaiveBestFitAllocator alloc{platform::NPUPlace(0)};
-  {
-    size_t size = (1 << 20);
-    auto allocation = alloc.Allocate(size);
-  }
-  sleep(10);
-  alloc.Release(platform::NPUPlace(0));
-
-  size_t size = (1 << 20);
-  auto allocation = alloc.Allocate(size);
-  alloc.Release(platform::NPUPlace(0));
-}
-#endif
 #ifdef PADDLE_WITH_MLU
 TEST(NaiveBestFitAllocatorTest, MluAlloc) {
   NaiveBestFitAllocator alloc{platform::MLUPlace(0)};
...

paddle/fluid/memory/allocation/npu_allocator.cc
deleted (100644 → 0)
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/npu_allocator.h"

#include <string>

#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace memory {
namespace allocation {

bool NPUAllocator::IsAllocThreadSafe() const { return true; }

void NPUAllocator::FreeImpl(phi::Allocation* allocation) {
  PADDLE_ENFORCE_EQ(
      allocation->place(),
      place_,
      platform::errors::PermissionDenied(
          "NPU memory is freed in incorrect device. This may be a bug"));
  platform::RecordedNPUFree(
      allocation->ptr(), allocation->size(), place_.device);
  delete allocation;
}

phi::Allocation* NPUAllocator::AllocateImpl(size_t size) {
  std::call_once(once_flag_,
                 [this] { platform::SetNPUDeviceId(place_.device); });

  void* ptr;
  auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device);
  if (LIKELY(result == ACL_ERROR_NONE)) {
    return new Allocation(ptr, size, platform::Place(place_));
  }

  size_t avail, total, actual_avail, actual_total;
  bool is_limited = platform::RecordedNPUMemGetInfo(
      &avail, &total, &actual_avail, &actual_total, place_.device);

  std::string err_msg;
  if (is_limited) {
    auto limit_size = (total >> 20);
    err_msg = string::Sprintf(
        "Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger "
        "value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum "
        "GPU memory usage is limited to %d MB.\n"
        "   The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
        limit_size,
        limit_size);
  }

  PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
      "\n\nOut of memory error on NPU %d. "
      "Cannot allocate %s memory on NPU %d, "
      "available memory is only %s.\n\n"
      "Please check whether there is any other process using NPU %d.\n"
      "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
      "2. If no, please decrease the batch size of your model. %s\n\n",
      place_.device,
      string::HumanReadableSize(size),
      place_.device,
      string::HumanReadableSize(avail),
      place_.device,
      err_msg));
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
paddle/fluid/memory/allocation/npu_allocator.h
deleted (100644 → 0)
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <mutex>  // NOLINT

#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace memory {
namespace allocation {

class NPUAllocator : public Allocator {
 public:
  explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {}

  bool IsAllocThreadSafe() const override;

 protected:
  void FreeImpl(phi::Allocation* allocation) override;
  phi::Allocation* AllocateImpl(size_t size) override;

 private:
  platform::NPUPlace place_;
  std::once_flag once_flag_;
};

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
paddle/fluid/memory/allocation/npu_pinned_allocator.cc
deleted (100644 → 0)
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"

namespace paddle {
namespace memory {
namespace allocation {

void NPUPinnedAllocator::ProcessEventsAndFree() {
  for (auto it = npu_events_.begin(); it != npu_events_.end();) {
    aclrtEvent event = it->second;
    aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
    platform::NPUEventQuery(event, &status);

    if (status == ACL_EVENT_STATUS_COMPLETE) {
      auto *allocation = it->first;
      void *ptr = allocation->ptr();
      free(ptr);
      npu_events_.erase(it++);
      delete allocation;
      platform::NPUEventDestroy(event);
    } else {
      ++it;
    }
  }
}

phi::Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) {
  std::lock_guard<std::mutex> lock(mtx_);
  ProcessEventsAndFree();
  void *ptr;
  int error = posix_memalign(&ptr, kAlignment, size);
  PADDLE_ENFORCE_EQ(
      error,
      0,
      platform::errors::ResourceExhausted(
          "Fail to alloc memory of %ld size, error code is %d.", size, error));
  return new Allocation(ptr, size, platform::NPUPinnedPlace());
}

void NPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) {
  std::lock_guard<std::mutex> lock(mtx_);
  void *ptr = allocation->ptr();
  auto iter = npu_events_.find(allocation);

  // Managed by GC if not called RecordEvent.
  if (iter == npu_events_.end()) {
    // double free? No such problem has been found so far.
    // Or maybe we need a set<Allocation*> to record which
    // Allocation managed by GC.
    free(ptr);
    delete allocation;
    return;
  }

  aclrtEvent event = iter->second;
  aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
  platform::NPUEventQuery(event, &status);
  if (status == ACL_EVENT_STATUS_COMPLETE) {
    free(ptr);
    npu_events_.erase(allocation);
    delete allocation;
    platform::NPUEventDestroy(event);
  }
  return;
}

uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) {
  std::lock_guard<std::mutex> lock(mtx_);
  // Empty implementation
  return static_cast<uint64_t>(0);
}

void NPUPinnedAllocator::RecordEvent(phi::Allocation *allocation,
                                     aclrtStream stream) {
  std::lock_guard<std::mutex> lock(mtx_);
  aclrtEvent event = nullptr;
  platform::NPUEventCreate(&event);
  platform::NPUEventRecord(event, stream);
  npu_events_.insert({allocation, event});
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
#endif
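The allocator above never frees pinned host memory eagerly: FreeImpl releases a block only once the device event recorded against it has completed, and AllocateImpl sweeps finished events before grabbing new memory. A stripped-down sketch of that deferred-free idea, using a plain bool in place of the aclrtEvent query; illustrative only:

#include <cstdlib>
#include <unordered_map>

struct FakeEvent { bool complete = false; };  // stand-in for aclrtEvent

class DeferredFreeList {
 public:
  // Record that a buffer is still referenced by in-flight device work.
  void MarkInFlight(void* ptr, FakeEvent* ev) { in_flight_[ptr] = ev; }

  // Called on free and before each allocation, like ProcessEventsAndFree().
  void Sweep() {
    for (auto it = in_flight_.begin(); it != in_flight_.end();) {
      if (it->second->complete) {
        std::free(it->first);
        it = in_flight_.erase(it);
      } else {
        ++it;
      }
    }
  }

 private:
  std::unordered_map<void*, FakeEvent*> in_flight_;
};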
paddle/fluid/memory/allocation/npu_pinned_allocator.h
deleted (100644 → 0)
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#ifdef PADDLE_WITH_ASCEND_CL
#include <mutex>  // NOLINT
#include <string>
#include <unordered_map>

#include "acl/acl.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace memory {
namespace allocation {

class NPUPinnedAllocator : public Allocator {
 public:
  bool IsAllocThreadSafe() const override { return true; }
  void ProcessEventsAndFree();
  void RecordEvent(phi::Allocation *allocation, aclrtStream stream);
  constexpr static size_t kAlignment = 4096UL;

 protected:
  phi::Allocation *AllocateImpl(size_t size) override;
  void FreeImpl(phi::Allocation *allocation) override;
  uint64_t ReleaseImpl(const platform::Place &place) override;

 private:
  std::unordered_map<phi::Allocation *, aclrtEvent> npu_events_;
  mutable std::mutex mtx_;
};

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
#endif
paddle/fluid/memory/allocation/system_allocator.cc
@@ -287,135 +287,6 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-void *NPUAllocator::Alloc(size_t *index, size_t size) {
-  if (size <= 0) return nullptr;
-
-  void *p;
-  auto result = platform::RecordedNPUMalloc(&p, size, npu_id_);
-
-  if (result == ACL_ERROR_NONE) {
-    *index = 0;
-    npu_alloc_size_ += size;
-    return p;
-  } else {
-    size_t avail, total, actual_avail, actual_total;
-    bool is_limited = platform::RecordedNPUMemGetInfo(
-        &avail, &total, &actual_avail, &actual_total, npu_id_);
-
-    std::string err_msg;
-    if (is_limited) {
-      auto limit_size = (total >> 20);
-      err_msg = string::Sprintf(
-          "\n   3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
-          "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
-          "maximum GPU memory usage is limited to %d MB.\n"
-          "      The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
-          limit_size,
-          limit_size);
-    }
-
-    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
-        "\n\nOut of memory error on NPU %d. "
-        "Cannot allocate %s memory on NPU %d, "
-        "available memory is only %s.\n\n"
-        "Please check whether there is any other process using NPU %d.\n"
-        "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
-        "2. If no, please try one of the following suggestions:\n"
-        "   1) Decrease the batch size of your model.\n"
-        "   2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
-        "please set it to a higher value but less than 1.0.\n"
-        "      The command is "
-        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
-        npu_id_,
-        string::HumanReadableSize(size),
-        npu_id_,
-        string::HumanReadableSize(avail),
-        npu_id_,
-        FLAGS_fraction_of_gpu_memory_to_use,
-        err_msg));
-  }
-}
-
-void NPUAllocator::Free(void *p, size_t size, size_t index) {
-  VLOG(4) << "Free " << p << " size " << size;
-  PADDLE_ENFORCE_EQ(index,
-                    0,
-                    platform::errors::InvalidArgument(
-                        "The index should be 0, index is %d", index));
-  PADDLE_ENFORCE_GE(npu_alloc_size_,
-                    size,
-                    platform::errors::InvalidArgument(
-                        "The size of memory (%d) to free exceeds the size of "
-                        "allocated gpu memory (%d)",
-                        size,
-                        npu_alloc_size_));
-  npu_alloc_size_ -= size;
-
-  platform::RecordedNPUFree(p, size, npu_id_);
-}
-
-bool NPUAllocator::UseGpu() const { return true; }
-
-void *NPUPinnedAllocator::Alloc(size_t *index, size_t size) {
-  if (size <= 0) return nullptr;
-
-  size_t usable =
-      phi::backends::cpu::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
-
-  if (size > usable) {
-    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
-                 << " MB pinned memory."
-                 << ", available " << usable / 1024.0 / 1024.0 << " MB";
-    return nullptr;
-  }
-
-  void *p;
-  // PINNED memory is visible to all NPU contexts.
-  auto result = platform::NPUHostMalloc(&p, size);
-
-  if (result == ACL_ERROR_NONE) {
-    *index = 1;  // PINNED memory
-    npu_pinnd_alloc_size_ += size;
-    return p;
-  } else {
-    LOG(WARNING) << "NPUHostMalloc failed.";
-    return nullptr;
-  }
-
-  return nullptr;
-}
-
-void NPUPinnedAllocator::Free(void *p, size_t size, size_t index) {
-  aclError err;
-  PADDLE_ENFORCE_EQ(index,
-                    1,
-                    platform::errors::InvalidArgument(
-                        "The index should be 1, but got %d", index));
-
-  PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_,
-                    size,
-                    platform::errors::InvalidArgument(
-                        "The size of memory (%d) to free exceeds the size of "
-                        "allocated npu pinned memory (%d)",
-                        size,
-                        npu_pinnd_alloc_size_));
-  npu_pinnd_alloc_size_ -= size;
-  err = platform::NPUHostFree(p);
-
-  if (err != ACL_ERROR_NONE) {
-    PADDLE_ENFORCE_EQ(
-        err,
-        0,
-        platform::errors::Fatal(
-            "NPUHostFree failed in NPUPinnedAllocator, error code is %d",
-            err));
-  }
-}
-
-bool NPUPinnedAllocator::UseGpu() const { return false; }
-#endif
 #ifdef PADDLE_WITH_MLU
 void *MLUAllocator::Alloc(size_t *index, size_t size) {
   if (size <= 0) return nullptr;
...
...

paddle/fluid/memory/allocation/system_allocator.h
@@ -68,32 +68,6 @@ class CUDAPinnedAllocator : public SystemAllocator {
 };
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-
-class NPUAllocator : public SystemAllocator {
- public:
-  explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {}
-
-  virtual void *Alloc(size_t *index, size_t size);
-  virtual void Free(void *p, size_t size, size_t index);
-  virtual bool UseGpu() const;
-
- private:
-  size_t npu_alloc_size_ = 0;
-  int npu_id_;
-};
-
-class NPUPinnedAllocator : public SystemAllocator {
- public:
-  virtual void *Alloc(size_t *index, size_t size);
-  virtual void Free(void *p, size_t size, size_t index);
-  virtual bool UseGpu() const;
-
- private:
-  size_t npu_pinnd_alloc_size_ = 0;
-};
-#endif
 #ifdef PADDLE_WITH_MLU
 class MLUAllocator : public SystemAllocator {
  public:
...

paddle/fluid/memory/allocation/system_allocator_test.cc
@@ -83,14 +83,6 @@ TEST(GPUAllocator, AllocFailure) {
 }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-TEST(NPUAllocator, Alloc) {
-  paddle::memory::detail::NPUAllocator a(0);
-  TestAllocator(&a, 1 << 20);
-  TestAllocator(&a, 1);
-}
-#endif
 #ifdef PADDLE_WITH_MLU
 TEST(MLUAllocator, Alloc) {
   paddle::memory::detail::MLUAllocator a(0);
...

paddle/fluid/memory/memcpy.cc
@@ -260,415 +260,6 @@ void Copy<phi::Place, phi::XPUPlace>(phi::Place dst_place,
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-template <>
-void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::CPUPlace src_place,
-                                                  const void* src,
-                                                  size_t num,
-                                                  void* stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  platform::SetNPUDeviceId(dst_place.device);
-
-  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place << " by thream(" << stream << ")";
-
-  if (stream) {
-    platform::RecordEvent record_event(
-        "NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1);
-    platform::NPUMemcpyAsync(dst,
-                             src,
-                             num,
-                             ACL_MEMCPY_HOST_TO_DEVICE,
-                             reinterpret_cast<aclrtStream>(stream));
-  } else {
-    // On NPU, async operation after sync operation is ok, while sync operation
-    // after async is not ok, since the async operation may not done.
-    // So, its needed to do wait before sync operation.
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
-
-    platform::RecordEvent record_event(
-        "NpuMemcpySync:CPU->NPU", platform::TracerEventType::UserDefined, 1);
-    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
-  }
-}
-
-template <>
-void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::NPUPlace src_place,
-                                                  const void* src,
-                                                  size_t num,
-                                                  void* stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  platform::SetNPUDeviceId(src_place.device);
-
-  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place << " by thream(" << stream << ")";
-
-  if (stream) {
-    platform::RecordEvent record_event(
-        "NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1);
-    platform::NPUMemcpyAsync(dst,
-                             src,
-                             num,
-                             ACL_MEMCPY_DEVICE_TO_HOST,
-                             reinterpret_cast<aclrtStream>(stream));
-  } else {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
-
-    platform::RecordEvent record_event(
-        "NpuMemcpySync:NPU->CPU", platform::TracerEventType::UserDefined, 1);
-    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
-  }
-}
-
-template <>
-void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::NPUPlace src_place,
-                                                  const void* src,
-                                                  size_t num,
-                                                  void* stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place << " by stream(" << stream << ")";
-  if (dst_place == src_place) {
-    platform::SetNPUDeviceId(src_place.device);
-    if (stream) {
-      platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU",
-                                         platform::TracerEventType::UserDefined,
-                                         1);
-      platform::NPUMemcpyAsync(dst,
-                               src,
-                               num,
-                               ACL_MEMCPY_DEVICE_TO_DEVICE,
-                               reinterpret_cast<aclrtStream>(stream));
-    } else {
-      platform::DeviceContextPool& pool =
-          platform::DeviceContextPool::Instance();
-      static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
-
-      platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU",
-                                         platform::TracerEventType::UserDefined,
-                                         1);
-      platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
-    }
-  } else {
-    if (!platform::NPUCanAccessPeer(dst_place.device, dst_place.device)) {
-      PADDLE_THROW(platform::errors::Unavailable(
-          "Peer access between NPU places is not allowed."));
-    }
-    if (stream) {
-      // TODO(zhiqiu): support peer access?
-      platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU",
-                                         platform::TracerEventType::UserDefined,
-                                         1);
-      platform::NPUMemcpyAsync(dst,
-                               src,
-                               num,
-                               ACL_MEMCPY_DEVICE_TO_DEVICE,
-                               reinterpret_cast<aclrtStream>(stream));
-    } else {
-      platform::DeviceContextPool& pool =
-          platform::DeviceContextPool::Instance();
-      static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
-
-      platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU",
-                                         platform::TracerEventType::UserDefined,
-                                         1);
-      platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
-    }
-  }
-}
-
-template <>
-void Copy<platform::CPUPlace, platform::NPUPinnedPlace>(
-    platform::CPUPlace dst_place,
-    void* dst,
-    platform::NPUPinnedPlace src_place,
-    const void* src,
-    size_t num) {
-  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place;
-  if (UNLIKELY(num == 0)) return;
-  std::memcpy(dst, src, num);
-}
-
-template <>
-void Copy<platform::NPUPinnedPlace, platform::CPUPlace>(
-    platform::NPUPinnedPlace dst_place,
-    void* dst,
-    platform::CPUPlace src_place,
-    const void* src,
-    size_t num) {
-  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place;
-  if (UNLIKELY(num == 0)) return;
-  std::memcpy(dst, src, num);
-}
-
-template <>
-void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>(
-    platform::NPUPinnedPlace dst_place,
-    void* dst,
-    platform::NPUPinnedPlace src_place,
-    const void* src,
-    size_t num) {
-  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place;
-  if (UNLIKELY(num == 0)) return;
-  std::memcpy(dst, src, num);
-}
-
-template <>
-void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
-    platform::NPUPinnedPlace dst_place,
-    void* dst,
-    platform::NPUPlace src_place,
-    const void* src,
-    size_t num,
-    void* stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  platform::SetNPUDeviceId(src_place.device);
-
-  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place << " by thream(" << stream << ")";
-
-  if (stream) {
-    platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned",
-                                       platform::TracerEventType::UserDefined,
-                                       1);
-    platform::NPUMemcpyAsync(dst,
-                             src,
-                             num,
-                             ACL_MEMCPY_DEVICE_TO_HOST,
-                             reinterpret_cast<aclrtStream>(stream));
-  } else {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
-
-    platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned",
-                                       platform::TracerEventType::UserDefined,
-                                       1);
-    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
-  }
-}
-
-template <>
-void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
-    platform::NPUPlace dst_place,
-    void* dst,
-    platform::NPUPinnedPlace src_place,
-    const void* src,
-    size_t num,
-    void* stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  platform::SetNPUDeviceId(dst_place.device);
-
-  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place << " by thream(" << stream << ")";
-
-  if (stream) {
-    platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU",
-                                       platform::TracerEventType::UserDefined,
-                                       1);
-    platform::NPUMemcpyAsync(dst,
-                             src,
-                             num,
-                             ACL_MEMCPY_HOST_TO_DEVICE,
-                             reinterpret_cast<aclrtStream>(stream));
-  } else {
-    // On NPU, async operation after sync operation is ok, while sync operation
-    // after async is not ok, since the async operation may not done.
-    // So, its needed to do wait before sync operation.
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
-
-    platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU",
-                                       platform::TracerEventType::UserDefined,
-                                       1);
-    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
-  }
-}
-
-// NOTE: only for CPUPlace, NPUPlace and NPUPinnedPlace.
-template <>
-void Copy<phi::Place, phi::Place>(phi::Place dst_place,
-                                  void* dst,
-                                  phi::Place src_place,
-                                  const void* src,
-                                  size_t num,
-                                  aclrtStream stream) {
-  if (src_place.GetType() == phi::AllocationType::CPU &&
-      dst_place.GetType() == phi::AllocationType::CPU) {
-    platform::CPUPlace place_dst, place_src;
-    return Copy(place_dst, dst, place_src, src, num);
-  } else if (src_place.GetType() == phi::AllocationType::CPU &&
-             dst_place.GetType() == phi::AllocationType::NPU) {
-    platform::NPUPlace place_dst(dst_place.GetDeviceId());
-    platform::CPUPlace place_src;
-    return Copy(place_dst, dst, place_src, src, num, stream);
-  } else if (src_place.GetType() == phi::AllocationType::NPU &&
-             dst_place.GetType() == phi::AllocationType::CPU) {
-    platform::NPUPlace place_src(src_place.GetDeviceId());
-    platform::CPUPlace place_dst;
-    return Copy(place_dst, dst, place_src, src, num, stream);
-  } else if (src_place.GetType() == phi::AllocationType::NPU &&
-             dst_place.GetType() == phi::AllocationType::NPU) {
-    platform::NPUPlace place_src(src_place.GetDeviceId());
-    platform::NPUPlace place_dst(dst_place.GetDeviceId());
-    return Copy(place_dst, dst, place_src, src, num, stream);
-  } else if (src_place.GetType() == phi::AllocationType::CPU &&
-             dst_place.GetType() == phi::AllocationType::NPUPINNED) {
-    platform::CPUPlace place_src;
-    platform::NPUPinnedPlace place_dst;
-    return Copy(place_dst, dst, place_src, src, num);
-  } else if (src_place.GetType() == phi::AllocationType::NPUPINNED &&
-             dst_place.GetType() == phi::AllocationType::CPU) {
-    platform::CPUPlace place_dst;
-    platform::NPUPinnedPlace place_src;
-    return Copy(place_dst, dst, place_src, src, num);
-  } else if (src_place.GetType() == phi::AllocationType::NPUPINNED &&
-             dst_place.GetType() == phi::AllocationType::NPUPINNED) {
-    platform::NPUPinnedPlace place_dst;
-    platform::NPUPinnedPlace place_src;
-    return Copy(place_dst, dst, place_src, src, num);
-  } else if (src_place.GetType() == phi::AllocationType::NPUPINNED &&
-             dst_place.GetType() == phi::AllocationType::NPU) {
-    platform::NPUPinnedPlace place_src;
-    platform::NPUPlace place_dst(dst_place.GetDeviceId());
-    return Copy(place_dst, dst, place_src, src, num, stream);
-  } else if (src_place.GetType() == phi::AllocationType::NPU &&
-             dst_place.GetType() == phi::AllocationType::NPUPINNED) {
-    platform::NPUPinnedPlace place_dst;
-    platform::NPUPlace place_src(src_place.GetDeviceId());
-    return Copy(place_dst, dst, place_src, src, num, stream);
-#ifdef PADDLE_WITH_CUSTOM_DEVICE
-  } else if (src_place.GetType() == phi::AllocationType::CPU &&  // NOLINT
-             dst_place.GetType() == phi::AllocationType::CUSTOM) {
-    platform::CPUPlace place_src;
-    platform::CustomPlace place_dst(dst_place);
-    return Copy(place_dst, dst, place_src, src, num, stream);
-  } else if (src_place.GetType() == phi::AllocationType::CUSTOM &&  // NOLINT
-             dst_place.GetType() == phi::AllocationType::CPU) {
-    platform::CustomPlace place_src(src_place);
-    platform::CPUPlace place_dst;
-    return Copy(place_dst, dst, place_src, src, num, stream);
-  } else if (src_place.GetType() == phi::AllocationType::CUSTOM &&  // NOLINT
-             dst_place.GetType() == phi::AllocationType::CUSTOM) {
-    platform::CustomPlace place_src(src_place);
-    platform::CustomPlace place_dst(dst_place);
-    return Copy(place_dst, dst, place_src, src, num, stream);
-#endif
-  }
-}
-
-// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (CPUPlace).
-template <>
-void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
-                                     void* dst,
-                                     phi::Place src_place,
-                                     const void* src,
-                                     size_t num,
-                                     aclrtStream stream) {
-  Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
-}
-
-// NOTE: only for (CPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace).
-template <>
-void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
-                                     void* dst,
-                                     phi::CPUPlace src_place,
-                                     const void* src,
-                                     size_t num,
-                                     aclrtStream stream) {
-  Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
-}
-
-// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPlace)
-template <>
-void Copy<phi::NPUPlace, phi::Place>(phi::NPUPlace dst_place,
-                                     void* dst,
-                                     phi::Place src_place,
-                                     const void* src,
-                                     size_t num,
-                                     aclrtStream stream) {
-  Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()),
-       dst,
-       src_place,
-       src,
-       num,
-       stream);
-}
-
-// NOTE: only for (NPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace)
-template <>
-void Copy<phi::Place, phi::NPUPlace>(phi::Place dst_place,
-                                     void* dst,
-                                     phi::NPUPlace src_place,
-                                     const void* src,
-                                     size_t num,
-                                     aclrtStream stream) {
-  Copy(dst_place,
-       dst,
-       phi::Place(src_place.GetType(), src_place.GetDeviceId()),
-       src,
-       num,
-       stream);
-}
-
-// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPinnedPlace)
-template <>
-void Copy<phi::NPUPinnedPlace, phi::Place>(phi::NPUPinnedPlace dst_place,
-                                           void* dst,
-                                           phi::Place src_place,
-                                           const void* src,
-                                           size_t num,
-                                           aclrtStream stream) {
-  Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
-}
-
-// NOTE: only for (NPUPinnedPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace)
-template <>
-void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place,
-                                           void* dst,
-                                           phi::NPUPinnedPlace src_place,
-                                           const void* src,
-                                           size_t num,
-                                           aclrtStream stream) {
-  Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
-}
-
-// NOTE: only for (CPUPlace) -> (NPUPinnedPlace)
-template <>
-void Copy<phi::NPUPinnedPlace, phi::Place>(phi::NPUPinnedPlace dst_place,
-                                           void* dst,
-                                           phi::Place src_place,
-                                           const void* src,
-                                           size_t num) {
-  Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr);
-}
-
-// NOTE: only for (NPUPinnedPlace) -> (CPUPlace)
-template <>
-void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place,
-                                           void* dst,
-                                           phi::NPUPinnedPlace src_place,
-                                           const void* src,
-                                           size_t num) {
-  Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr);
-}
-#endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
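The removed Copy&lt;phi::Place, phi::Place&gt; overload is the dispatch half of this file: a runtime-tagged place pair is matched into one of the statically typed Copy specializations. A compressed sketch of that double dispatch; the types and helpers below are illustrative, not Paddle APIs:

#include <cstddef>
#include <cstring>

enum class Kind { kHost, kDevice };
struct Place { Kind kind; int id; };

static void CopyHostToHost(void* d, const void* s, size_t n) { std::memcpy(d, s, n); }
static void CopyHostToDevice(void*, const void*, size_t) { /* driver H2D memcpy */ }
static void CopyDeviceToHost(void*, const void*, size_t) { /* driver D2H memcpy */ }

// Fold the runtime tags into one statically-known route, mirroring the
// else-if ladder of the removed overload.
void Copy(Place dst, void* d, Place src, const void* s, size_t n) {
  if (src.kind == Kind::kHost && dst.kind == Kind::kHost) {
    CopyHostToHost(d, s, n);
  } else if (src.kind == Kind::kHost && dst.kind == Kind::kDevice) {
    CopyHostToDevice(d, s, n);
  } else {
    CopyDeviceToHost(d, s, n);
  }
}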
...
@@ -1391,18 +982,6 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place,
     std::memcpy(dst, src, num);
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  else if (src_place.GetType() == phi::AllocationType::CPU &&  // NOLINT
-           dst_place.GetType() == phi::AllocationType::NPUPINNED) {
-    std::memcpy(dst, src, num);
-  } else if (src_place.GetType() == phi::AllocationType::NPUPINNED &&
-             dst_place.GetType() == phi::AllocationType::CPU) {
-    std::memcpy(dst, src, num);
-  } else if (src_place.GetType() == phi::AllocationType::NPUPINNED &&
-             dst_place.GetType() == phi::AllocationType::NPUPINNED) {
-    std::memcpy(dst, src, num);
-  }
-#endif
 #ifdef PADDLE_WITH_XPU
   else if (src_place.GetType() == phi::AllocationType::CPU &&  // NOLINT
            dst_place.GetType() == phi::AllocationType::CPU) {
...
@@ -1488,8 +1067,7 @@ void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
 }
 
 #if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA) && \
-    !defined(PADDLE_WITH_ASCEND_CL) && !defined(PADDLE_WITH_HIP) && \
-    !defined(PADDLE_WITH_MLU)
+    !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MLU)
 template <>
 void Copy<phi::Place, phi::Place>(phi::Place dst_place,
...
paddle/fluid/operators/coalesce_tensor_op.cc
@@ -21,8 +21,7 @@
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/phi/backends/device_memory_aligment.h"
#include "paddle/phi/backends/device_memory_aligment.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/convert_utils.h"
#ifdef PADDLE_WITH_MLU
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
...
...
paddle/fluid/operators/copy_cross_scope_test.cc
@@ -148,16 +148,4 @@ TEST(copy_cross_scope_to_main_scope, CUDA_fp32) {
   ctx.PartialInitWithAllocator();
   Compare2<float>(&scope, ctx, "copy_cross_scope");
 }
-#elif PADDLE_WITH_ASCEND_CL
-TEST(copy_cross_scope, NPU_fp32) {
-  f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare1<float>(&scope, ctx, "copy_cross_scope");
-}
-
-TEST(copy_cross_scope_to_main_scope, NPU_fp32) {
-  f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare2<float>(&scope, ctx, "copy_cross_scope");
-}
 #endif
paddle/fluid/operators/detection/CMakeLists.txt
@@ -28,15 +28,9 @@ function(detection_library TARGET_NAME)
   PARENT_SCOPE)
 endfunction()
 
-if(WITH_ASCEND_CL)
-  detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op_npu.cc)
-  detection_library(density_prior_box_op SRCS density_prior_box_op.cc
-                    density_prior_box_op.cu density_prior_box_op_npu.cc)
-else()
-  detection_library(box_coder_op SRCS box_coder_op.cc)
-  detection_library(density_prior_box_op SRCS density_prior_box_op.cc
-                    density_prior_box_op.cu)
-endif()
+detection_library(box_coder_op SRCS box_coder_op.cc)
+detection_library(density_prior_box_op SRCS density_prior_box_op.cc
+                  density_prior_box_op.cu)
 
 if(WITH_XPU)
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
...
@@ -49,11 +43,6 @@ elseif(WITH_MLU)
                     iou_similarity_op_mlu.cc)
   detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc)
   detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc)
-elseif(WITH_ASCEND_CL)
-  detection_library(iou_similarity_op SRCS iou_similarity_op.cc
-                    iou_similarity_op_npu.cc)
-  detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_npu.cc)
-  detection_library(yolo_box_op SRCS yolo_box_op.cc)
 else()
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                     iou_similarity_op.cu)
...

paddle/fluid/operators/expand_op.h
@@ -36,13 +36,6 @@ inline std::vector<int> get_expand_times(
         *expand_tensor, platform::CPUPlace(), &cpu_expand_tensor);
     expand_data = cpu_expand_tensor.data<int>();
   }
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (platform::is_npu_place(expand_tensor->place())) {
-    paddle::framework::TensorCopySync(
-        *expand_tensor, platform::CPUPlace(), &cpu_expand_tensor);
-    expand_data = cpu_expand_tensor.data<int>();
-  }
-#endif
 #ifdef PADDLE_WITH_XPU
   if (platform::is_xpu_place(expand_tensor->place())) {
     paddle::framework::TensorCopySync(
...

paddle/fluid/operators/expand_v2_op.h
@@ -37,13 +37,6 @@ inline std::vector<int> get_expand_shape(
         *shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
     shape_data = cpu_shape_tensor.data<int>();
   }
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (platform::is_npu_place(shape_tensor->place())) {
-    paddle::framework::TensorCopySync(
-        *shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
-    shape_data = cpu_shape_tensor.data<int>();
-  }
-#endif
 #ifdef PADDLE_WITH_XPU
   if (platform::is_xpu_place(shape_tensor->place())) {
     paddle::framework::TensorCopySync(
...
@@ -75,13 +68,6 @@ inline std::vector<int> get_expand_shape(
       paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
       vec_epxand_shape.push_back(*temp.data<int32_t>());
     }
-#ifdef PADDLE_WITH_ASCEND_CL
-    else if (platform::is_npu_place(tensor->place())) {  // NOLINT
-      phi::DenseTensor temp;
-      paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
-      vec_epxand_shape.push_back(*temp.data<int32_t>());
-    }
-#endif
 #ifdef PADDLE_WITH_XPU
     else if (platform::is_xpu_place(tensor->place())) {  // NOLINT
       phi::DenseTensor temp;
...

paddle/fluid/operators/math/CMakeLists.txt
-if(WITH_ASCEND_CL)
-  cc_library(
-    beam_search_npu
-    SRCS beam_search_npu.cc
-    DEPS npu_op_runner)
-endif()
-
 if(WITH_XPU)
   cc_library(
     beam_search_xpu
...
@@ -13,9 +6,7 @@ if(WITH_XPU)
 endif()
 
 # please add new math_library in alphabetical order
-if(WITH_ASCEND_CL)
-  math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner)
-elseif(WITH_MLU)
+if(WITH_MLU)
   math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop)
 else()
   math_library(concat_and_split DEPS concat_and_split_functor)
...
paddle/fluid/operators/memcpy_d2h_op.cc
@@ -122,34 +122,6 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
     MemcpyD2HInferShapeFunctor);
-#ifdef PADDLE_WITH_ASCEND_CL
-REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_d2h,
-                               float,
-                               ops::MemcpyD2HKernel,
-                               double,
-                               ops::MemcpyD2HKernel,
-                               int8_t,
-                               ops::MemcpyD2HKernel,
-                               uint8_t,
-                               ops::MemcpyD2HKernel,
-                               int,
-                               ops::MemcpyD2HKernel,
-                               int64_t,
-                               ops::MemcpyD2HKernel,
-                               bool,
-                               ops::MemcpyD2HKernel,
-                               paddle::platform::bfloat16,
-                               ops::MemcpyD2HKernel,
-                               paddle::platform::complex<float>,
-                               ops::MemcpyD2HKernel,
-                               paddle::platform::complex<double>,
-                               ops::MemcpyD2HKernel,
-                               plat::float16,
-                               ops::MemcpyD2HKernel,
-                               int16_t,
-                               ops::MemcpyD2HKernel);
-#endif
 #ifdef PADDLE_WITH_IPU
 REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_d2h,
                                float,
...

paddle/fluid/operators/norm_op.cc
@@ -87,11 +87,7 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker<T> {
     op->SetAttrMap(this->Attrs());
     op->SetInput("X", this->Input("X"));
     op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-#ifndef PADDLE_WITH_ASCEND_CL
     op->SetInput("Norm", this->Output("Norm"));
-#else
-    op->SetInput("Out", this->Output("Out"));
-#endif
     op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
   }
 };
...
paddle/fluid/platform/device/device_wrapper.h
@@ -25,9 +25,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#ifdef PADDLE_WITH_MLU
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/enforce.h"
#include "paddle/fluid/platform/device/mlu/enforce.h"
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
...
...
paddle/fluid/platform/device_context.cc
...
@@ -248,31 +248,6 @@ void EmplaceDeviceContexts(
     PADDLE_THROW(
         platform::errors::Unimplemented("IPUPlace is not supported. Please "
                                         "re-compile with WITH_IPU option."));
 #endif
-  } else if (platform::is_npu_place(place)) {
-#ifdef PADDLE_WITH_ASCEND_CL
-    EmplaceDeviceContext<NPUDeviceContext>(
-        place_to_device_context,
-        place,
-        disable_setting_default_stream_for_allocator,
-        /*unused*/ stream_priority);
-#else
-    PADDLE_THROW(
-        platform::errors::Unimplemented("NPUPlace is not supported. Please "
-                                        "re-compile with WITH_ASCEND_CL option."));
-#endif
-  } else if (platform::is_npu_pinned_place(place)) {
-#ifdef PADDLE_WITH_ASCEND_CL
-    EmplaceDeviceContext<NPUPinnedDeviceContext>(
-        place_to_device_context,
-        place,
-        disable_setting_default_stream_for_allocator,
-        /*unused*/ stream_priority);
-#else
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "NPUPinnedPlace is not supported. Please re-compile with "
-        "WITH_ASCEND_CL "
-        "option."));
-#endif
   }
 }
...
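The deleted branches follow the same guard-or-throw shape as the surviving ones: construct the context type matching the place when the backend is compiled in, otherwise throw Unimplemented. A minimal standalone sketch of that pattern (the WITH_FAKE_NPU macro and all type names are illustrative):

#include <memory>
#include <stdexcept>

struct DeviceContext { virtual ~DeviceContext() = default; };
struct CPUContext : DeviceContext {};
#ifdef WITH_FAKE_NPU  // stand-in for a PADDLE_WITH_* build option
struct FakeNPUContext : DeviceContext {};
#endif

enum class PlaceKind { kCPU, kNPU };

std::unique_ptr<DeviceContext> MakeDeviceContext(PlaceKind place) {
  switch (place) {
    case PlaceKind::kCPU:
      return std::make_unique<CPUContext>();
    case PlaceKind::kNPU:
#ifdef WITH_FAKE_NPU
      return std::make_unique<FakeNPUContext>();
#else
      // Mirrors the PADDLE_THROW branch: fail clearly when the backend
      // was not compiled in.
      throw std::runtime_error(
          "NPU place is not supported; re-compile with the matching option.");
#endif
  }
  return nullptr;
}

int main() { auto ctx = MakeDeviceContext(PlaceKind::kCPU); }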
paddle/fluid/platform/device_context.h
...
@@ -68,8 +68,6 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_ASCEND_CL
-#endif
 #include "paddle/phi/backends/device_ext.h"
 #include "paddle/phi/backends/stream.h"
...
@@ -89,10 +87,6 @@ struct GpuDevice;
 #include "paddle/phi/backends/xpu/xpu_context.h"
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-#include "acl/acl.h"
-#endif
 namespace paddle {
 namespace platform {
...
@@ -150,86 +144,6 @@ namespace xpu = baidu::xpu::api;
 using XPUDeviceContext = phi::XPUContext;
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-class NPUDeviceContext
-    : public DeviceContext,
-      public phi::TypeInfoTraits<DeviceContext, NPUDeviceContext> {
- public:
-  explicit NPUDeviceContext(NPUPlace place);
-  virtual ~NPUDeviceContext();
-  Eigen::DefaultDevice* eigen_device() const { return nullptr; }
-  const Place& GetPlace() const override;
-  aclrtContext context() const;
-
-  /*! \brief Wait for all operations completion in the stream. */
-  void Wait() const override;
-
-  /*! \brief Return npu stream in the device context. */
-  aclrtStream stream() const;
-
-  template <typename Callback>
-  void AddStreamCallback(Callback&& callback) const {
-    return stream_->AddCallback(callback);
-  }
-
-  void WaitStreamCallback() const { return stream_->WaitCallback(); }
-
-#if defined(PADDLE_WITH_ASCEND_CL)
-  /*! \brief Return hccl communicators. */
-  HcclComm hccl_comm() const { return hccl_comm_; }
-
-  /*! \brief Set hccl communicators. */
-  void set_hccl_comm(HcclComm comm) { hccl_comm_ = comm; }
-#endif
-
-  // template <typename Callback>
-  // void AddStreamCallback(Callback&& callback) const {
-  //   return stream_->AddCallback(callback);
-  // }
-  // void WaitStreamCallback() const { return stream_->WaitCallback(); }
-
-  static const char* name() { return "NPUDeviceContext"; }
-
- private:
-  NPUPlace place_;
-  aclrtContext context_;
-
-#ifdef PADDLE_WITH_ASCEND_CL
-  // HCCLContext_t hccl_context_;
-  HcclComm hccl_comm_{nullptr};
-#endif
-
-  // Need to be the same with other DeviceContext,
-  // Eventhough eigen_device_ is not used in NPU
-  // NOTE(zhiqiu): why need?
-  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
-  std::shared_ptr<stream::NPUStream> stream_;
-
-  DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
-};
-
-// Currently, NPUPinnedDeviceContext is only used to data copying.
-class NPUPinnedDeviceContext
-    : public DeviceContext,
-      public phi::TypeInfoTraits<DeviceContext, NPUPinnedDeviceContext> {
- public:
-  NPUPinnedDeviceContext();
-  explicit NPUPinnedDeviceContext(NPUPinnedPlace place);
-
-  const Place& GetPlace() const override;
-
-  Eigen::DefaultDevice* eigen_device() const;
-
-  static const char* name() { return "NPUPinnedDeviceContext"; }
-
- private:
-  NPUPinnedPlace place_;
-  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
-};
-#endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 using CUDAPinnedDeviceContext = phi::GPUPinnedContext;
 #endif
...
@@ -264,18 +178,6 @@ template <>
 struct DefaultDeviceContextType<phi::MLUPlace>;
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-template <>
-struct DefaultDeviceContextType<phi::NPUPlace> {
-  using TYPE = paddle::platform::NPUDeviceContext;
-};
-
-template <>
-struct DefaultDeviceContextType<phi::NPUPinnedPlace> {
-  using TYPE = paddle::platform::NPUPinnedDeviceContext;
-};
-#endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template <>
 struct DefaultDeviceContextType<phi::GPUPinnedPlace> {
...
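The removed specializations used the DefaultDeviceContextType trait, a template specialized per place type to name the matching context type at compile time. A self-contained sketch of the same trait technique (all types illustrative):

#include <type_traits>

struct CPUPlace {};
struct FakeNPUPlace {};
struct CPUContext {};
struct FakeNPUContext {};

// Primary template left undefined: unsupported places fail to compile.
template <typename Place>
struct DefaultDeviceContextType;

template <>
struct DefaultDeviceContextType<CPUPlace> { using TYPE = CPUContext; };

template <>
struct DefaultDeviceContextType<FakeNPUPlace> { using TYPE = FakeNPUContext; };

static_assert(
    std::is_same<DefaultDeviceContextType<CPUPlace>::TYPE, CPUContext>::value,
    "a place resolves to its context type at compile time");

int main() {}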
paddle/fluid/platform/device_event.h
...
@@ -38,12 +38,6 @@ USE_EVENT_WAIT(kCUDA, kCUDA)
 USE_EVENT_WAIT(kCPU, kCUDA)
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-USE_EVENT(kNPU);
-USE_EVENT_WAIT(kNPU, kNPU)
-USE_EVENT_WAIT(kCPU, kNPU)
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 USE_EVENT(kCUSTOM_DEVICE);
 USE_EVENT_WAIT(kCUSTOM_DEVICE, kCUSTOM_DEVICE)
...
paddle/fluid/platform/device_event_npu.cc
deleted, 100644 → 0
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device_event_base.h"
#include "paddle/fluid/platform/event.h"

namespace paddle {
namespace platform {

struct NPUDeviceEventWrapper {
  explicit NPUDeviceEventWrapper(const platform::Place& place) {
    PADDLE_ENFORCE_EQ(
        platform::is_npu_place(place),
        true,
        platform::errors::PreconditionNotMet(
            "Required device shall be NPUPlace, but received %d. ", place));

    device_id_ = place.device;
    PADDLE_ENFORCE_GT(
        device_id_,
        -1,
        platform::errors::PreconditionNotMet(
            "Required DeviceOption.device_id > -1, but received %d. ",
            device_id_));
    inner_event_ = NpuEventResourcePool::Instance().New(device_id_);
  }
  std::shared_ptr<NpuEventObject> inner_event_;
  int device_id_;
};

void DeviceEventCreateNPU(DeviceEvent* event,
                          const platform::Place& place,
                          unsigned int) {
  event->InitEvent(std::make_shared<NPUDeviceEventWrapper>(place));
}

void DeviceEventRecordNPU(DeviceEvent* event, const DeviceContext* context) {
  auto* wrapper = static_cast<NPUDeviceEventWrapper*>(event->GetEvent().get());
  auto* npu_dev_ctx = dynamic_cast<const platform::NPUDeviceContext*>(context);
  PADDLE_ENFORCE_NOT_NULL(
      npu_dev_ctx,
      platform::errors::PreconditionNotMet(
          "Failed to dynamic_cast context into NPUDeviceContext."));
  NPUEventRecord(wrapper->inner_event_.get(), npu_dev_ctx->stream());
}

bool DeviceEventQueryNPU(const DeviceEvent* event) {
  auto* wrapper = static_cast<NPUDeviceEventWrapper*>(event->GetEvent().get());
  PADDLE_ENFORCE_NOT_NULL(
      wrapper,
      platform::errors::PreconditionNotMet(
          "Failed to dynamic_cast event into NPUDeviceEventWrapper."));
  aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
  platform::NPUEventQuery(wrapper->inner_event_.get(), &status);
  return ACL_EVENT_STATUS_COMPLETE == status;
}

void DeviceEventFinishNPU(const DeviceEvent* event) {
  auto* wrapper = static_cast<NPUDeviceEventWrapper*>(event->GetEvent().get());
  NPUEventSynchronize(wrapper->inner_event_.get());
}

void DeviceEventNPUWaitNPU(const DeviceEvent* event,
                           const DeviceContext* context) {
  auto* wrapper = static_cast<NPUDeviceEventWrapper*>(event->GetEvent().get());
  auto* npu_dev_ctx = dynamic_cast<const platform::NPUDeviceContext*>(context);
  PADDLE_ENFORCE_NOT_NULL(
      npu_dev_ctx,
      platform::errors::PreconditionNotMet(
          "Failed to dynamic_cast context into NPUDeviceContext."));
  NPUStreamWaitEvent(npu_dev_ctx->stream(), wrapper->inner_event_.get());
}

void DeviceEventCPUWaitNPU(const DeviceEvent* event,
                           const DeviceContext* context) {
  DeviceEventFinishNPU(event);
}

void DeviceEventSetFinishedNPU(const DeviceEvent* event) {
  // do nothing
}

void EventResetNPU(const DeviceEvent* event) {
  // do nothing
}

}  // namespace platform
}  // namespace paddle

using ::paddle::platform::kCPU;
using ::paddle::platform::kNPU;
REGISTER_EVENT_CREATE_FUNCTION(kNPU, paddle::platform::DeviceEventCreateNPU)
REGISTER_EVENT_RECORD_FUNCTION(kNPU, paddle::platform::DeviceEventRecordNPU)
REGISTER_EVENT_QUERY_FUNCTION(kNPU, paddle::platform::DeviceEventQueryNPU)
REGISTER_EVENT_FINISH_FUNCTION(kNPU, paddle::platform::DeviceEventFinishNPU)
REGISTER_EVENT_SET_FINISHED_FUNCTION(kNPU,
                                     paddle::platform::DeviceEventSetFinishedNPU)
REGISTER_EVENT_WAIT_FUNCTION(kNPU, kNPU, paddle::platform::DeviceEventNPUWaitNPU)
REGISTER_EVENT_WAIT_FUNCTION(kCPU, kNPU, paddle::platform::DeviceEventCPUWaitNPU)
REGISTER_EVENT_RESET_FUNCTION(kNPU, paddle::platform::EventResetNPU)
#endif
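The REGISTER_EVENT_* lines at the bottom of the deleted file populate per-(waitor, waitee) dispatch tables. A toy standalone sketch of that registry idea (names illustrative, not Paddle's device_event_base API):

#include <functional>
#include <iostream>
#include <map>
#include <utility>

enum DeviceType { kCPU = 0, kNPU = 1 };

struct DeviceEvent { bool complete = false; };

// (waitor, waitee) -> wait function, mirroring REGISTER_EVENT_WAIT_FUNCTION.
using WaitFn = std::function<void(DeviceEvent&)>;
static std::map<std::pair<DeviceType, DeviceType>, WaitFn> g_event_wait;

void DeviceEventCPUWaitFake(DeviceEvent& e) {
  e.complete = true;  // a real backend would block on the event here
}

int main() {
  g_event_wait[{kCPU, kNPU}] = DeviceEventCPUWaitFake;
  DeviceEvent ev;
  g_event_wait[{kCPU, kNPU}](ev);
  std::cout << std::boolalpha << ev.complete << "\n";  // true
}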
paddle/fluid/platform/dynload/dynamic_loader.cc
...
@@ -54,7 +54,6 @@ void* GetCUDADsoHandle() { return phi::dynload::GetCUDADsoHandle(); }
 void* GetWarpCTCDsoHandle() { return phi::dynload::GetWarpCTCDsoHandle(); }
 void* GetNCCLDsoHandle() { return phi::dynload::GetNCCLDsoHandle(); }
-void* GetHCCLDsoHandle() { return phi::dynload::GetHCCLDsoHandle(); }
 void* GetTensorRtDsoHandle() { return phi::dynload::GetTensorRtDsoHandle(); }
...
paddle/fluid/platform/dynload/dynamic_loader.h
...
@@ -37,7 +37,6 @@ void* GetNVRTCDsoHandle();
 void* GetCUDADsoHandle();
 void* GetWarpCTCDsoHandle();
 void* GetNCCLDsoHandle();
-void* GetHCCLDsoHandle();
 void* GetTensorRtDsoHandle();
 void* GetMKLMLDsoHandle();
 void* GetLAPACKDsoHandle();
...
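Each getter forwards to phi::dynload, which returns a process-wide handle to the backing shared library. A standalone sketch of the lazy-load idea behind such getters (the library name and dlopen details are assumptions for illustration, not phi's actual implementation):

#include <dlfcn.h>  // POSIX dynamic loading; link with -ldl
#include <cstdio>

// Load the library once on first use and cache the handle thereafter.
void* GetFooDsoHandle() {
  static void* handle = dlopen("libfoo.so", RTLD_LAZY | RTLD_GLOBAL);
  return handle;
}

int main() {
  if (GetFooDsoHandle() == nullptr) {
    std::printf("libfoo.so not found: %s\n", dlerror());
  }
}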
paddle/fluid/platform/gen_comm_id_helper.cc
...
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \
-    defined(PADDLE_WITH_CNCL)
+    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL)
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
 #include <arpa/inet.h>
...
paddle/fluid/platform/gen_comm_id_helper.h
...
@@ -14,9 +14,8 @@ limitations under the License. */
 #pragma once
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \
-    defined(PADDLE_WITH_CNCL)
+    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL)
 #include <functional>
 #include <memory>
 #include <mutex>
...
paddle/fluid/platform/init.cc
...
@@ -187,17 +187,6 @@ void InitDevices() {
     LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime.";
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  // NOTE(zhiqiu): use singleton to explicitly init and finalize ACL
-  platform::AclInstance::Instance();  // NOLINT
-  try {
-    // use user specified XPUs in single-node multi-process mode.
-    devices = platform::GetSelectedNPUDevices();
-  } catch (const std::exception& exp) {
-    LOG(WARNING) << "Compiled with PADDLE_WITH_ASCEND_CL, but no NPU found "
-                    "in runtime.";
-  }
-#endif
 #ifdef PADDLE_WITH_IPU
   try {
     // use user specified IPUs.
...
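The removed block probed for NPUs inside a try/catch and downgraded a missing device to a warning, the same shape the surviving IPU branch keeps. A standalone sketch of that probe-and-warn pattern (all names illustrative):

#include <iostream>
#include <stdexcept>
#include <vector>

// Stub that mimics a device query failing when no accelerator is present.
std::vector<int> GetSelectedFakeDevices() {
  throw std::runtime_error("no device found");
}

void InitDevices() {
  std::vector<int> devices;
  try {
    devices = GetSelectedFakeDevices();
  } catch (const std::exception& exp) {
    // Missing hardware is a warning, not a fatal error, at init time.
    std::cerr << "Compiled with device support, but none found in runtime.\n";
  }
}

int main() { InitDevices(); }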
paddle/fluid/pybind/ascend_wrapper_py.cc
deleted, 100644 → 0 (diff collapsed)
paddle/fluid/pybind/ascend_wrapper_py.h
deleted, 100644 → 0
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#ifdef PADDLE_WITH_ASCEND_CL
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"

namespace py = pybind11;

namespace paddle {
namespace pybind {

void BindAscendGraph(py::module* m);
void BindAscendWrapper(py::module* m);
void BindAscendDevice(py::module* m);

}  // namespace pybind
}  // namespace paddle
#endif
paddle/fluid/pybind/imperative.cc
...
@@ -2616,19 +2616,6 @@ void BindImperative(py::module *m_ptr) {
           py::arg("ring_id"));
 #endif
-#if defined(PADDLE_WITH_ASCEND_CL)
-  py::class_<imperative::HCCLParallelContext,
-             imperative::ParallelContext,
-             std::shared_ptr<imperative::HCCLParallelContext>>(
-      m, "HCCLParallelContext")
-      .def(py::init<const imperative::ParallelStrategy &,
-                    const platform::NPUPlace &>())
-      .def("init", [](imperative::HCCLParallelContext &self) { self.Init(); })
-      .def("init_with_ring_id",
-           &imperative::HCCLParallelContext::InitWithRingID,
-           py::arg("ring_id"));
-#endif
 #if defined(PADDLE_WITH_CNCL)
   py::class_<imperative::CNCLParallelContext,
              imperative::ParallelContext,
...
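For reference, the binding pattern the deleted HCCLParallelContext block used is plain pybind11: expose init() and init_with_ring_id(ring_id) on a context class. A minimal self-contained sketch with an illustrative class (not Paddle's imperative API):

#include <pybind11/pybind11.h>
#include <memory>

namespace py = pybind11;

// Stand-in for a parallel context; the real binding exposes the same two
// methods on imperative::*ParallelContext classes.
struct FakeParallelContext {
  void Init() {}
  void InitWithRingID(int ring_id) { (void)ring_id; }
};

PYBIND11_MODULE(fake_parallel, m) {
  py::class_<FakeParallelContext,
             std::shared_ptr<FakeParallelContext>>(m, "FakeParallelContext")
      .def(py::init<>())
      .def("init", [](FakeParallelContext& self) { self.Init(); })
      .def("init_with_ring_id",
           &FakeParallelContext::InitWithRingID,
           py::arg("ring_id"));
}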
paddle/fluid/pybind/inference_api.cc
...
@@ -772,7 +772,6 @@ void BindAnalysisConfig(py::module *m) {
          py::arg("device_type"),
          py::arg("device_id") = 0,
          py::arg("precision") = AnalysisConfig::Precision::kFloat32)
-    .def("enable_npu", &AnalysisConfig::EnableNpu, py::arg("device_id") = 0)
     .def("enable_ipu",
          &AnalysisConfig::EnableIpu,
          py::arg("ipu_device_num") = 1,
...
@@ -1063,13 +1062,7 @@ void BindPaddleInferPredictor(py::module *m) {
     .def("get_output_names", &paddle_infer::Predictor::GetOutputNames)
     .def("get_input_handle", &paddle_infer::Predictor::GetInputHandle)
     .def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle)
-    .def("run",
-         [](paddle_infer::Predictor &self) {
-#ifdef PADDLE_WITH_ASCEND_CL
-           pybind11::gil_scoped_release release;
-#endif
-           self.Run();
-         })
+    .def("run", [](paddle_infer::Predictor &self) { self.Run(); })
     .def("clone",
          [](paddle_infer::Predictor &self) { return self.Clone(nullptr); })
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...
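The lines removed from run() released the GIL around Predictor::Run() on Ascend builds so other Python threads could proceed during inference. A minimal pybind11 sketch of that GIL-release pattern (FakePredictor is an illustrative stand-in):

#include <pybind11/pybind11.h>

namespace py = pybind11;

struct FakePredictor {
  void Run() { /* long-running inference on the device */ }
};

PYBIND11_MODULE(fake_infer, m) {
  py::class_<FakePredictor>(m, "Predictor")
      .def(py::init<>())
      .def("run", [](FakePredictor& self) {
        // Dropping the GIL lets other Python threads make progress while
        // the device executes; reacquired automatically on scope exit.
        py::gil_scoped_release release;
        self.Run();
      });
}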
paddle/fluid/pybind/parallel_executor.cc
...
@@ -139,10 +139,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/device/xpu/xpu_info.h"
 #include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
...
paddle/fluid/pybind/place.cc
...
@@ -139,10 +139,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/device/xpu/xpu_info.h"
 #include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
...
@@ -553,57 +549,14 @@ void BindPlace(pybind11::module &m) {  // NOLINT
   py::class_<platform::NPUPlace> npuplace(m, "NPUPlace", R"DOC(
     NPUPlace is a descriptor of a device.
     It represents a NPU device on which a tensor will be allocated and a model will run.

     Examples:
         .. code-block:: python

             # required: npu
             import paddle
             place = paddle.NPUPlace(0)
         )DOC");
   g_npuplace_pytype = reinterpret_cast<PyTypeObject *>(npuplace.ptr());
   npuplace
-      .def("__init__",
-           [](platform::NPUPlace &self, int dev_id) {
-#ifdef PADDLE_WITH_ASCEND_CL
-             if (UNLIKELY(dev_id < 0)) {
-               LOG(ERROR) << string::Sprintf(
-                   "Invalid NPUPlace(%d), device id must be 0 or "
-                   "positive integer",
-                   dev_id);
-               std::exit(-1);
-             }
-             if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) {
-               if (platform::GetNPUDeviceCount() == 0) {
-                 LOG(ERROR) << "Cannot use NPU because there is no NPU "
-                               "detected on your "
-                               "machine.";
-                 std::exit(-1);
-               } else {
-                 LOG(ERROR) << string::Sprintf(
-                     "Invalid NPUPlace(%d), must inside [0, %d), because NPU "
-                     "number on your machine is %d",
-                     dev_id,
-                     platform::GetNPUDeviceCount(),
-                     platform::GetNPUDeviceCount());
-                 std::exit(-1);
-               }
-             }
-             new (&self) platform::NPUPlace(dev_id);
-#else
-             LOG(ERROR) << string::Sprintf(
-                 "Cannot use NPU because you have installed CPU/GPU version "
-                 "PaddlePaddle.\n"
-                 "If you want to use NPU, please try to install NPU version "
-                 "PaddlePaddle by: pip install paddlepaddle-npu\n"
-                 "If you only have CPU, please change NPUPlace(%d) to be "
-                 "CPUPlace().\n",
-                 dev_id);
-             std::exit(-1);
-#endif
-           })
+      .def("__init__", [](platform::NPUPlace &self, int dev_id) {})
       .def("_type", &PlaceIndex<platform::NPUPlace>)
       .def("_equals", &IsSamePlace<platform::NPUPlace, platform::Place>)
       .def("_equals", &IsSamePlace<platform::NPUPlace, platform::CUDAPlace>)
...
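Boiled down, the deleted __init__ body validates the device id before placement-new'ing the place object. A standalone sketch of that validation (device_count stands in for platform::GetNPUDeviceCount(); names illustrative):

#include <cstdio>
#include <cstdlib>

// Reject negative ids, then ids outside [0, device_count), mirroring the
// checks in the removed lambda above.
void ValidateDeviceId(int dev_id, int device_count) {
  if (dev_id < 0) {
    std::fprintf(stderr,
                 "Invalid place(%d), device id must be 0 or a positive "
                 "integer\n", dev_id);
    std::exit(-1);
  }
  if (dev_id >= device_count) {
    if (device_count == 0) {
      std::fprintf(stderr, "No device detected on this machine.\n");
    } else {
      std::fprintf(stderr, "Invalid place(%d), must be inside [0, %d)\n",
                   dev_id, device_count);
    }
    std::exit(-1);
  }
}

int main() { ValidateDeviceId(0, 1); }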
paddle/fluid/pybind/pybind.cc
(diff collapsed)
paddle/fluid/pybind/tensor.cc
...
@@ -139,10 +139,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/device/xpu/xpu_info.h"
 #include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
...
paddle/fluid/pybind/tensor_py.h
(diff collapsed)
paddle/phi/backends/device_memory_aligment.h
...
@@ -19,9 +19,7 @@ limitations under the License. */
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/errors.h"
-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/phi/backends/npu/npu_info.h"
-#endif
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #ifdef PADDLE_WITH_MLU
 #include "paddle/phi/backends/mlu/mlu_info.h"
...
@@ -44,8 +42,6 @@ inline size_t Alignment(size_t size,
   alignment = phi::backends::gpu::GpuMinChunkSize();
 #elif defined(PADDLE_WITH_XPU)
   alignment = alignment;
-#elif defined(PADDLE_WITH_ASCEND_CL)
-  alignment = phi::backends::npu::NPUMinChunkSize();
 #elif defined(PADDLE_WITH_MLU)
   alignment = phi::backends::mlu::MLUMinChunkSize();
 #else
...
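Alignment() picks a backend-specific minimum chunk size, then rounds the requested size up to a multiple of it; the NPU branch is simply gone now. A standalone sketch under assumed placeholder chunk sizes (not the real GpuMinChunkSize()/MLUMinChunkSize() values):

#include <cstddef>
#include <cstdio>

// Select a backend minimum chunk size at compile time, then round the
// requested size up to the next multiple of it.
size_t Alignment(size_t size, size_t alignment) {
#if defined(WITH_FAKE_GPU)
  alignment = 256;    // placeholder for GpuMinChunkSize()
#elif defined(WITH_FAKE_MLU)
  alignment = 1024;   // placeholder for MLUMinChunkSize()
#endif
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
}

int main() { std::printf("%zu\n", Alignment(100, 64)); }  // prints 128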
paddle/phi/backends/dynload/CMakeLists.txt (diff collapsed)
paddle/phi/backends/dynload/dynamic_loader.cc (diff collapsed)
paddle/phi/backends/dynload/dynamic_loader.h (diff collapsed)
paddle/phi/backends/npu/npu_info.h (deleted, 100644 → 0; diff collapsed)
paddle/phi/core/flags.cc (diff collapsed)
paddle/phi/core/utils/visit_place.h (diff collapsed)
paddle/phi/kernels/funcs/interpolate_function.h (diff collapsed)
test/CMakeLists.txt (diff collapsed)
test/amp/CMakeLists.txt (diff collapsed)
test/asp/CMakeLists.txt (diff collapsed)