Unverified commit ccf5709d authored by Leo Chen, committed by GitHub

[NPU] cherry-pick basic NPU components/allocator/operator/executor supports from ascendrc (#32144)

* [feature] support npu allocator (#30840)

[feature] support npu allocator

* [feature] support npu operator (#30951)

[feature] support npu operator

* [feature] support npu allocator, part 2 (#30972)

* support npu allocator

* add npu device context

* fix some compile problem

* fix some compile problem

* add npu info

* compile ok

* fix include dir

* support naive_best_fit_allocator

* run ut ok, but failed to exit

* call aclrtResetDevice before exit

* fix aclFinalize

* add system allocator test

* add selected_gpus in gtest

* add tensor_test for npu

* support npu op, initial commit

* add npu stream

* add elementwise_add_op

* compile ok

* fix typo

* fix elementwise_add_op_npu_test

* support op run

* test can run but failed

* change aclopExecuteV2 to aclopCompileAndExecute

* support parsing ascend rank table file (#31000)

support parsing ascend rank table file

* Fix reshape on GE graph. (#31084)

Fix reshape on GE graph

* add npu kernel for elementwise_sub and elementwise_sub_grad (#30973)

* add npu sub op

* fix typo

* rename test

* fix bug

* fix bug

* add fp16 kernel

* fix typo

* support sub grad op

* support elementwise_sub_grad op
Co-authored-by: frankwhzhang <frankwhzhang@126.com>

* Fix compilation problem (#31100)

Fix compilation problem (#31100)

* fix compile

* fix code style

* remove const_cast

* support adding correct npu op in pybind.h (#31143)

* support adding correct npu op in pybind.h

* refine code

* [NPU] Support executor with NPU (#31057)

* [NPU] Support executor with NPU

* Fix code according to reviews

* Fix code

* Add unittest for sub op npu

* refactor npu device manager (#31154)

refactor npu device manager (#31154)

* fix selected npus

* fix compile

* fix reading flags from env

* format
Co-authored-by: xiayanming <41795079@qq.com>
Co-authored-by: gongweibao <weibao.gong@gmail.com>
Co-authored-by: frankwhzhang <frankwhzhang@126.com>
Co-authored-by: liym27 <33742067+liym27@users.noreply.github.com>
Parent a73cb679
......@@ -33,11 +33,14 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
# to develop some acl related functionality on x86
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
endif()
if (WITH_GPU AND WITH_ASCEND)
message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
endif()
......
......@@ -82,6 +82,10 @@ if(WITH_ASCEND)
add_definitions(-DPADDLE_WITH_ASCEND)
endif()
if(WITH_ASCEND_CL)
add_definitions(-DPADDLE_WITH_ASCEND_CL)
endif()
if(WITH_XPU)
message(STATUS "Compile with XPU!")
add_definitions(-DPADDLE_WITH_XPU)
......
......@@ -21,38 +21,60 @@ else()
set(ASCEND_DIR /usr/local/Ascend)
endif()
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
if(WITH_ASCEND)
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
endif()
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
if(WITH_ASCEND_CL)
set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}")
message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib})
add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
endif()
......@@ -201,6 +201,9 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
else()
SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546)
......
......@@ -11,6 +11,7 @@ function(op_library TARGET)
set(cu_cc_srcs)
set(hip_cc_srcs)
set(xpu_cc_srcs)
set(npu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(miopen_cu_cc_srcs)
set(cudnn_cu_srcs)
......@@ -20,6 +21,9 @@ function(op_library TARGET)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function layer common_infer_shape_functions)
if (WITH_ASCEND_CL)
set(op_common_deps ${op_common_deps} npu_op_runner)
endif()
# Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build.
set(options UNITY)
set(oneValueArgs "")
......@@ -85,6 +89,12 @@ function(op_library TARGET)
list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
endif()
endif()
if(WITH_ASCEND_CL)
string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
list(APPEND npu_cc_srcs ${NPU_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$")
......@@ -107,6 +117,8 @@ function(op_library TARGET)
list(APPEND cu_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
list(APPEND xpu_cc_srcs ${src})
elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
list(APPEND npu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
......@@ -176,7 +188,7 @@ function(op_library TARGET)
# Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
if(WITH_UNITY_BUILD AND op_library_UNITY)
# Combine the cc source files.
compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs})
compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs})
if(TARGET ${UNITY_TARGET})
# If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources})
......@@ -187,7 +199,7 @@ function(op_library TARGET)
# Add alias library to handle dependencies.
add_library(${TARGET} ALIAS ${UNITY_TARGET})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS}
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
endif()
......@@ -207,6 +219,7 @@ function(op_library TARGET)
# For the registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to just add one operator to pybind in a *_op.cc file.
# And for detailed pybind information, please see the generated paddle/pybind/pybind.h.
set(ORIGINAL_TARGET ${TARGET})
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
# [ \t\r\n]* is used for blank characters
......@@ -239,8 +252,9 @@ function(op_library TARGET)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
list(LENGTH npu_cc_srcs npu_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0 AND ${npu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
......@@ -280,6 +294,26 @@ function(op_library TARGET)
if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
endif()
if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT)
# It is different from the logic above, be careful
string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\(.*" multi_npu_register "${TARGET_NPU_CONTENT}")
# [ \t\r\n]* is used for blank characters
string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_npu_register "${multi_npu_register}")
if (one_npu_register STREQUAL "")
string(REPLACE "_op" "" NPU_TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OP_NPU_KERNEL(" "" NPU_TARGET "${one_npu_register}")
string(REPLACE "," "" NPU_TARGET "${NPU_TARGET}")
# [ \t\r\n]+ is used for blank characters.
# Here we use '+' instead of '*' since it is a REPLACE operation.
string(REGEX REPLACE "[ \t\r\n]+" "" NPU_TARGET "${NPU_TARGET}")
endif()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${NPU_TARGET}, NPU);\n")
endif()
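As a rough illustration of what this branch produces (not part of the patch): for a kernel source such as the elementwise_add_op_npu.cc added later in this diff, the regex above picks the operator name out of the first REGISTER_OP_NPU_KERNEL occurrence and appends the matching USE_OP_DEVICE_KERNEL line to the generated pybind file:

// scanned source (see elementwise_add_op_npu.cc later in this diff)
REGISTER_OP_NPU_KERNEL(
    elementwise_add,
    ops::ElementwiseAddNPUKernel<paddle::platform::NPUDeviceContext, float>);
// line appended to ${pybind_file} by the CMake logic above
USE_OP_DEVICE_KERNEL(elementwise_add, NPU);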
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
......@@ -330,6 +364,7 @@ function(register_operators)
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" OPS "${OPS}")
string(REPLACE "_xpu" "" OPS "${OPS}")
string(REPLACE "_npu" "" OPS "${OPS}")
string(REPLACE ".cc" "" OPS "${OPS}")
list(REMOVE_DUPLICATES OPS)
list(LENGTH register_operators_DEPS register_operators_DEPS_len)
......
......@@ -274,10 +274,15 @@ if(WITH_BOX_PS)
list(APPEND third_party_deps extern_box_ps)
endif(WITH_BOX_PS)
if(WITH_ASCEND)
if(WITH_ASCEND OR WITH_ASCEND_CL)
include(external/ascend)
list(APPEND third_party_deps extern_ascend)
endif (WITH_ASCEND)
if(WITH_ASCEND)
list(APPEND third_party_deps extern_ascend)
endif()
if(WITH_ASCEND_CL)
list(APPEND third_party_deps extern_ascend_cl)
endif()
endif ()
if (WITH_PSCORE)
include(external/snappy)
......
......@@ -82,6 +82,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::NPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::NPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
::DLContext ctx;
......
......@@ -453,6 +453,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
} else if (platform::is_npu_place(place_)) {
#ifdef PADDLE_WITH_ASCEND_CL
// TODO(ascendrc): Support garbage collector on NPUPlace
VLOG(4) << "Skip NPU gc because it is not implemented now.";
#else
PADDLE_THROW(platform::errors::Unimplemented(
"No NPU gc found in CPU/GPU/XPU paddle"));
#endif
}
}
......
......@@ -86,8 +86,9 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_));
callback_manager_.reset(
new platform::StreamCallbackManager<gpuStream_t>(stream_));
#endif
callback_manager_.reset(new platform::StreamCallbackManager(stream_));
}
StreamGarbageCollector::~StreamGarbageCollector() {
......
......@@ -117,7 +117,8 @@ class StreamGarbageCollector : public GarbageCollector {
private:
gpuStream_t stream_;
std::unique_ptr<platform::StreamCallbackManager> callback_manager_;
std::unique_ptr<platform::StreamCallbackManager<gpuStream_t>>
callback_manager_;
};
class CUDAPinnedGarbageCollector : public GarbageCollector {
......
......@@ -61,6 +61,8 @@ inline LibraryType StringToLibraryType(const char* ctype) {
return LibraryType::kPlain;
} else if (s == std::string("XPU")) {
return LibraryType::kPlain;
} else if (s == std::string("NPU")) {
return LibraryType::kPlain;
} else if (s == std::string("CUDA")) {
return LibraryType::kPlain;
} else {
......
......@@ -304,6 +304,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
#define REGISTER_OP_NPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, NPU, ::paddle::platform::NPUPlace, __VA_ARGS__)
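For reference, a sketch of how this macro is used (the op name and kernel class here are hypothetical, not from the patch); by the definition above it simply forwards to REGISTER_OP_KERNEL bound to NPUPlace:

// register a float kernel of a hypothetical my_op for the NPU place
REGISTER_OP_NPU_KERNEL(
    my_op, ops::MyOpNPUKernel<paddle::platform::NPUDeviceContext, float>);
// which expands to
REGISTER_OP_KERNEL(my_op, NPU, ::paddle::platform::NPUPlace,
                   ops::MyOpNPUKernel<paddle::platform::NPUDeviceContext, float>);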
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
customized_name, \
customized_type_value, \
......
......@@ -208,6 +208,16 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
platform::SetXPUDeviceId(dev_id);
#endif
} else if (platform::is_npu_place(place)) {
#ifndef PADDLE_WITH_ASCEND_CL
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with NPU support.",
place));
#else
auto dev_id = BOOST_GET_CONST(platform::NPUPlace, place).device;
platform::SetNPUDeviceId(dev_id);
#endif
}
......@@ -1248,6 +1258,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound(
......
......@@ -625,6 +625,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const BuildStrategy &build_strategy,
ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places, scope)) {
PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]),
platform::errors::Unavailable(
"NPU is not supported in ParallelExecutor"));
InitP2P(places);
ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
member_->places_.size());
......
......@@ -125,25 +125,54 @@ TEST(Tensor, MutableData) {
float* p2 = nullptr;
// initialization
p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
platform::CUDAPlace());
platform::CUDAPlace(0));
auto p1_holder = src_tensor.Holder();
EXPECT_NE(p1, nullptr);
// set src_tensor a new dim with large size
// memory is supposed to be re-allocated
p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 1024}),
platform::CUDAPlace());
platform::CUDAPlace(0));
auto p2_holder = src_tensor.Holder();
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1_holder.get(), p2_holder.get());
// set src_tensor a new dim with same size
// memory block is supposed to be unchanged
p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
platform::CUDAPlace());
platform::CUDAPlace(0));
EXPECT_EQ(p1, p2);
// set src_tensor a new dim with smaller size
// memory block is supposed to be unchanged
p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
platform::CUDAPlace());
platform::CUDAPlace(0));
EXPECT_EQ(p1, p2);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
{
framework::Tensor src_tensor;
float* p1 = nullptr;
float* p2 = nullptr;
// initialization
p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
platform::NPUPlace(0));
auto p1_holder = src_tensor.Holder();
EXPECT_NE(p1, nullptr);
// set src_tensor a new dim with large size
// memory is supposed to be re-allocated
p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 1024}),
platform::NPUPlace(0));
auto p2_holder = src_tensor.Holder();
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1_holder.get(), p2_holder.get());
// set src_tensor a new dim with same size
// memory block is supposed to be unchanged
p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
platform::NPUPlace(0));
EXPECT_EQ(p1, p2);
// set src_tensor a new dim with smaller size
// memory block is supposed to be unchanged
p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
platform::NPUPlace(0));
EXPECT_EQ(p1, p2);
}
#endif
......@@ -179,7 +208,17 @@ TEST(Tensor, ShareDataWith) {
framework::Tensor src_tensor;
framework::Tensor dst_tensor;
src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
platform::CUDAPlace());
platform::CUDAPlace(0));
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
{
framework::Tensor src_tensor;
framework::Tensor dst_tensor;
src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
platform::NPUPlace(0));
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
......@@ -216,7 +255,34 @@ TEST(Tensor, Slice) {
{
framework::Tensor src_tensor;
src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
platform::CUDAPlace());
platform::CUDAPlace(0));
framework::Tensor slice_tensor = src_tensor.Slice(2, 6);
framework::DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2);
EXPECT_EQ(slice_dims[0], 4);
EXPECT_EQ(slice_dims[1], 9);
uintptr_t src_data_address =
reinterpret_cast<uintptr_t>(src_tensor.data<double>());
uintptr_t src_mutable_data_address =
reinterpret_cast<uintptr_t>(src_tensor.mutable_data<double>(
src_tensor.dims(), platform::CUDAPlace(0)));
uintptr_t slice_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
uintptr_t slice_mutable_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<double>(
slice_tensor.dims(), platform::CUDAPlace(0)));
EXPECT_EQ(src_data_address, src_mutable_data_address);
EXPECT_EQ(slice_data_address, slice_mutable_data_address);
EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
{
framework::Tensor src_tensor;
src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
platform::NPUPlace(0));
framework::Tensor slice_tensor = src_tensor.Slice(2, 6);
framework::DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2);
......@@ -227,12 +293,12 @@ TEST(Tensor, Slice) {
reinterpret_cast<uintptr_t>(src_tensor.data<double>());
uintptr_t src_mutable_data_address =
reinterpret_cast<uintptr_t>(src_tensor.mutable_data<double>(
src_tensor.dims(), platform::CUDAPlace()));
src_tensor.dims(), platform::NPUPlace(0)));
uintptr_t slice_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
uintptr_t slice_mutable_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<double>(
slice_tensor.dims(), platform::CUDAPlace()));
slice_tensor.dims(), platform::NPUPlace(0)));
EXPECT_EQ(src_data_address, src_mutable_data_address);
EXPECT_EQ(slice_data_address, slice_mutable_data_address);
EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
......
......@@ -97,6 +97,42 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
// TODO(zhiqiu): handle different conditions like the CUDA code below
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
stream);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) {
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size,
stream);
}
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
stream);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
......@@ -304,6 +340,35 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { /* npu -> cpu*/
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
nullptr);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { /* cpu -> npu*/
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size,
nullptr);
}
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { /* npu -> npu*/
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
nullptr);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
......@@ -431,6 +496,13 @@ class AnyVisitor : public boost::static_visitor<bool> {
return GetResultHelper(out, gpu);
}
bool GetResult(const framework::Tensor& out,
const platform::NPUPlace& npu) const {
PADDLE_THROW(
platform::errors::Unimplemented("Not supported on place (%s) ", npu));
// return GetResultHelper(out, npu);
}
bool GetResult(const framework::Tensor& out,
const platform::CPUPlace& cpu) const {
return *out.data<bool>();
......@@ -633,6 +705,10 @@ struct BothFalseVisitor : public boost::static_visitor<> {
#endif
}
void VisitorImpl(const platform::NPUPlace& npu) const {
// TODO(zhiqiu)
}
void VisitorImpl(const platform::CPUPlace& cpu) const {
int num = in_.numel();
const bool* in_ptr = in_.data<bool>();
......
......@@ -157,6 +157,14 @@ void TensorFromVector(const std::vector<T>& src,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place,
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
}
#endif
}
template <typename T>
......@@ -194,6 +202,14 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()),
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
}
#endif
}
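The NPU test added later in this diff exercises exactly these two branches; a condensed sketch of the host/NPU round trip (assuming an Ascend device 0, a build with PADDLE_WITH_ASCEND_CL, and the usual tensor_util/device_context header paths) looks like:

#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"

void RoundTripOnNPU() {
  paddle::platform::NPUDeviceContext ctx(paddle::platform::NPUPlace(0));
  paddle::framework::Tensor t;
  std::vector<float> src(100, 1.0f), dst;
  // host -> NPU copy on the context's ACL stream
  paddle::framework::TensorFromVector(src, ctx, &t);
  t.Resize({10, 10});
  // NPU -> host copy, then wait for the async copies to finish
  paddle::framework::TensorToVector(t, ctx, &dst);
  ctx.Wait();
}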
template <typename T>
......
......@@ -115,6 +115,23 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void operator()(const platform::NPUPlace& place) {
// TODO(zhiqiu): SUPPORT it
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const platform::NPUPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
// there is NO blas in CUDAPinnedPlace
void operator()(const platform::CUDAPinnedPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
......
......@@ -27,6 +27,10 @@ if (WITH_ROCM)
cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
endif()
if (WITH_ASCEND_CL)
cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info)
endif()
cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
if (WITH_GPU OR WITH_ROCM)
......
......@@ -32,6 +32,7 @@
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
#include "paddle/fluid/platform/npu_info.h"
DEFINE_int64(
gpu_allocator_retry_time, 10000,
......@@ -66,6 +67,11 @@ class AllocatorFacadePrivate {
InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_ASCEND_CL
for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
}
#endif
break;
}
......@@ -185,6 +191,12 @@ class AllocatorFacadePrivate {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
class ZeroSizeAllocator : public Allocator {
public:
explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
......
......@@ -19,7 +19,10 @@
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
......@@ -110,6 +113,7 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
// For kunlun XPU
template <>
void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_XPU
......@@ -219,6 +223,135 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#endif
}
// For Ascend NPU
#ifdef PADDLE_WITH_ASCEND_CL
class NPUBuddyAllocatorList {
private:
NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) {
auto npu_num = devices_.size();
allocators_.resize(npu_num);
init_flags_.reserve(npu_num);
for (size_t i = 0; i < npu_num; ++i) {
init_flags_.emplace_back(new std::once_flag());
}
}
static NPUBuddyAllocatorList *CreateNewInstance() {
return new NPUBuddyAllocatorList();
}
public:
static NPUBuddyAllocatorList *Instance() {
static auto *instance = CreateNewInstance();
return instance;
}
BuddyAllocator *Get(int npu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(), pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetNPUDeviceId(devices_[pos]);
allocators_[pos].reset(new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
new detail::NPUAllocator(devices_[pos])),
platform::NPUMinChunkSize(), platform::NPUMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
<< "or 'FLAGS_initial_gpu_memory_in_mb' "
<< "or 'FLAGS_reallocate_gpu_memory_in_mb' "
<< "to change the memory size for GPU usage.\n"
<< "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
<< FLAGS_fraction_of_gpu_memory_to_use
<< ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
<< FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
});
return allocators_[pos].get();
}
private:
std::vector<int> devices_;
std::vector<std::unique_ptr<std::once_flag>> init_flags_;
std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
};
BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
return NPUBuddyAllocatorList::Instance()->Get(npu_id);
}
#endif
template <>
size_t Used<platform::NPUPlace>(const platform::NPUPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUBuddyAllocator(place.device)->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
auto *buddy_allocator = GetNPUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
platform::NPUDeviceGuard(place.device);
size_t avail, total;
platform::NPUMemoryUsage(&avail, &total);
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize "
"%s, GpuMaxChunkSize %s, GPU memory used: %s.",
string::HumanReadableSize(size), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::NPUPlace>(place))));
} else {
if (FLAGS_init_allocated_mem) {
aclrtMemset(ptr, size, 0xEF, size);
}
}
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
return ptr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
void Free<platform::NPUPlace>(const platform::NPUPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetNPUBuddyAllocator(place.device)->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUBuddyAllocator(place.device)->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
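A minimal call sequence for the NPU specializations above (a sketch only; it assumes the code sits in the same namespace as naive_best_fit_allocator.cc and that NPU 0 is visible):

void NaiveBestFitNpuDemo() {
  platform::NPUPlace place(0);
  // take 1 MB from the NPU buddy allocator of device 0
  void* p = Alloc<platform::NPUPlace>(place, 1 << 20);
  VLOG(10) << "bytes currently used on NPU 0: "
           << Used<platform::NPUPlace>(place);
  // return the block, then give the idle chunks back to the device
  Free<platform::NPUPlace>(place, p, 1 << 20);
  Release<platform::NPUPlace>(place);
}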
// For CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class GPUBuddyAllocatorList {
private:
......
......@@ -61,6 +61,22 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(NaiveBestFitAllocatorTest, NpuAlloc) {
NaiveBestFitAllocator alloc{platform::NPUPlace(0)};
{
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
}
sleep(10);
alloc.Release(platform::NPUPlace(0));
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
alloc.Release(platform::NPUPlace(0));
}
#endif
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/npu_allocator.h"
#include <string>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/npu_info.h"
namespace paddle {
namespace memory {
namespace allocation {
bool NPUAllocator::IsAllocThreadSafe() const { return true; }
void NPUAllocator::FreeImpl(Allocation* allocation) {
PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_,
platform::errors::PermissionDenied(
"NPU memory is freed in incorrect device. This may be a bug"));
platform::RecordedNPUFree(allocation->ptr(), allocation->size(),
place_.device);
delete allocation;
}
Allocation* NPUAllocator::AllocateImpl(size_t size) {
std::call_once(once_flag_,
[this] { platform::SetNPUDeviceId(place_.device); });
void* ptr;
auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device);
if (LIKELY(result == ACL_ERROR_NONE)) {
return new Allocation(ptr, size, platform::Place(place_));
}
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedNPUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, place_.device);
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger "
"value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum "
"GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on NPU %d. "
"Cannot allocate %s memory on NPU %d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using NPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
"2. If no, please decrease the batch size of your model. %s\n\n",
place_.device, string::HumanReadableSize(size), place_.device,
string::HumanReadableSize(avail), place_.device, err_msg));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class NPUAllocator : public Allocator {
public:
explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {}
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size) override;
private:
platform::NPUPlace place_;
std::once_flag once_flag_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -6,6 +6,8 @@ if(WITH_GPU)
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
elseif(WITH_ROCM)
hip_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
elseif(${WITH_ASCEND_CL})
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info npu_info place)
else()
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place)
endif()
......
......@@ -21,6 +21,9 @@ limitations under the License. */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
namespace paddle {
namespace memory {
......@@ -235,6 +238,21 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the allocation size for npu for the first allocation.
allocate_bytes = std::max(platform::NPUInitAllocSize(), request_bytes);
} else {
// Compute the re-allocation size; we store the re-allocation size when the
// user sets FLAGS_reallocate_gpu_memory_in_mb to a fixed value.
if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
realloc_size_ = platform::NPUReallocSize();
}
allocate_bytes = std::max(realloc_size_, request_bytes);
}
}
#endif
// Allocate a new block
void* p = system_allocator_->Alloc(&index, allocate_bytes);
......
......@@ -26,6 +26,7 @@ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
namespace paddle {
namespace memory {
......
......@@ -19,14 +19,16 @@ limitations under the License. */
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
#include <fstream>
#include <string>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <fstream>
#include <string>
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL)
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
......@@ -342,6 +344,32 @@ TEST(BuddyAllocator, Release) {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(BuddyAllocator, NpuFraction) {
// In a 16 GB machine, the pool size will be about 160 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.005;
FLAGS_fraction_of_gpu_memory_to_use = 0.92;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new NPUAllocator(0)),
platform::NPUMinChunkSize(), platform::NPUMaxChunkSize());
// Less than pool size
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
buddy_allocator.Release();
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator, 300 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator, 1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -29,6 +29,8 @@ limitations under the License. */
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
......@@ -247,6 +249,68 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void* NPUAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
void* p;
auto result = platform::RecordedNPUMalloc(&p, size, npu_id_);
if (result == ACL_ERROR_NONE) {
*index = 0;
npu_alloc_size_ += size;
return p;
} else {
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedNPUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, npu_id_);
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on NPU %d. "
"Cannot allocate %s memory on NPU %d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using NPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
"2. If no, please try one of the following suggestions:\n"
" 1) Decrease the batch size of your model.\n"
" 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
npu_id_, string::HumanReadableSize(size), npu_id_,
string::HumanReadableSize(avail), npu_id_,
FLAGS_fraction_of_gpu_memory_to_use, err_msg));
}
}
void NPUAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(npu_alloc_size_, size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, npu_alloc_size_));
npu_alloc_size_ -= size;
platform::RecordedNPUFree(p, size, npu_id_);
}
bool NPUAllocator::UseGpu() const { return true; }
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -66,6 +66,22 @@ class CUDAPinnedAllocator : public SystemAllocator {
};
#endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUAllocator : public SystemAllocator {
public:
explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {}
virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t npu_alloc_size_ = 0;
int npu_id_;
};
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -85,3 +85,11 @@ TEST(GPUAllocator, AllocFailure) {
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(NPUAllocator, Alloc) {
paddle::memory::detail::NPUAllocator a(0);
TestAllocator(&a, 1 << 20);
TestAllocator(&a, 1);
}
#endif
......@@ -196,6 +196,85 @@ void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
template <>
void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num,
aclrtStream stream) {
if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(dst_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";
if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream);
} else {
platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
}
}
template <>
void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::NPUPlace src_place,
const void* src, size_t num,
aclrtStream stream) {
if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(src_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";
if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
}
}
template <>
void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
void* dst,
platform::NPUPlace src_place,
const void* src, size_t num,
aclrtStream stream) {
if (UNLIKELY(num == 0)) return;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by stream(" << stream << ")";
if (dst_place == src_place) {
platform::SetNPUDeviceId(src_place.device);
if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
stream);
} else {
platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
}
} else {
if (!platform::NPUCanAccessPeer(dst_place.device, src_place.device)) {
PADDLE_THROW(platform::errors::Unavailable(
"Peer access between NPU places is not allowed."));
}
if (stream) {
// TODO(zhiqiu): support peer access?
platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
stream);
} else {
platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
}
}
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K
......
......@@ -52,7 +52,27 @@ void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
gpuStream_t stream);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
/**
* \brief Copy memory from one place to another place.
*
* \param[in] DstPlace Destination allocation place (CPU or NPU).
* \param[in] dst Destination memory address.
* \param[in] SrcPlace Source allocation place (CPU or NPU).
* \param[in] src Source memory address.
* \param[in] num memory size in bytes to copy.
* \param[in] stream NPU stream.
*
* \note For NPU memory copy, an NPU stream needs to be specified
* for asynchronous memory copy.
*
*/
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
aclrtStream stream);
#endif
} // namespace memory
} // namespace paddle
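As a usage sketch for the NPU overload declared above (buffer pointers and sizes are placeholders; it assumes device 0 is visible): pass an aclrtStream for an asynchronous copy, or nullptr to take the synchronous path used by TensorCopySync earlier in this diff.

#include <cstddef>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"

void CopyHostToNpu(void* npu_dst, const void* host_src, size_t num_bytes) {
  paddle::platform::NPUPlace npu(0);
  paddle::platform::CPUPlace cpu;
  paddle::platform::NPUDeviceContext ctx(npu);
  // asynchronous host->NPU copy on the context's ACL stream
  paddle::memory::Copy(npu, npu_dst, cpu, host_src, num_bytes, ctx.stream());
  // synchronous host->NPU copy (stream == nullptr)
  paddle::memory::Copy(npu, npu_dst, cpu, host_src, num_bytes, nullptr);
  ctx.Wait();
}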
......@@ -123,6 +123,11 @@ if (WITH_ASCEND)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} ascend_wrapper)
endif()
if (WITH_ASCEND_CL)
cc_library(npu_op_runner SRCS npu_op_runner.cc DEPS operator npu_info)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} npu_op_runner)
endif()
# FIXME(typhoonzero): operator deps may not needed.
# op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
# op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
......
......@@ -8,3 +8,7 @@ register_operators(DEPS op_version_registry)
cc_test(test_elementwise_add_op_inplace SRCS test_elementwise_add_op_inplace.cc DEPS op_registry elementwise_add_op scope device_context enforce executor)
cc_test(test_elementwise_div_grad_grad SRCS test_elementwise_div_grad_grad.cc DEPS op_registry elementwise_div_op scope device_context enforce executor)
cc_test(test_elementwise_add_grad_grad SRCS test_elementwise_add_grad_grad.cc DEPS op_registry elementwise_add_op scope device_context enforce executor)
if(WITH_ASCEND_CL)
cc_test(elementwise_op_npu_test SRCS elementwise_op_npu_test.cc DEPS op_registry elementwise_add_op elementwise_sub_op scope device_context enforce executor)
endif()
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/fluid/operators/npu_op_runner.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* out = ctx.Output<framework::LoDTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Add", {*x, *y}, {*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
elementwise_add,
ops::ElementwiseAddNPUKernel<paddle::platform::NPUDeviceContext, float>);
#endif
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
USE_OP(elementwise_sub);
USE_OP_DEVICE_KERNEL(elementwise_sub, NPU);
template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx,
std::string op_type) {
// init
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<f::LoDTensor>();
auto y = scope->Var("Y");
auto tensor_y = y->GetMutable<f::LoDTensor>();
std::vector<T> init_x;
for (int64_t i = 0; i < 10 * 10; ++i) {
init_x.push_back(static_cast<T>(1.0));
}
std::vector<T> init_y;
for (int64_t i = 0; i < 10 * 10; ++i) {
init_y.push_back(static_cast<T>(2.0));
}
TensorFromVector(init_x, ctx, tensor_x);
tensor_x->Resize({10, 10});
TensorFromVector(init_y, ctx, tensor_y);
tensor_y->Resize({10, 10});
ctx.Wait();
auto place = ctx.GetPlace();
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<f::LoDTensor>();
// run
f::AttributeMap attrs;
auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}, {"Y", {"Y"}}},
{{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);
std::vector<T> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
float expected;
if (op_type == "elementwise_add") {
expected = 3.0;
} else if (op_type == "elementwise_sub") {
expected = -1.0;
}
EXPECT_EQ(out_vec.size(), init_x.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], static_cast<T>(expected));
}
}
template <typename T>
void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
std::string op_type) {
// init
auto dout = scope->Var("DOut");
auto tensor_dout = dout->GetMutable<f::LoDTensor>();
tensor_dout->Resize({2, 3, 5});
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<f::LoDTensor>();
tensor_x->Resize({2, 3, 5});
auto y = scope->Var("Y");
auto tensor_y = y->GetMutable<f::LoDTensor>();
tensor_y->Resize({1, 5});
auto dx = scope->Var("DX");
auto tensor_dx = dx->GetMutable<f::LoDTensor>();
auto dy = scope->Var("DY");
auto tensor_dy = dy->GetMutable<f::LoDTensor>();
std::vector<T> init_dout;
for (int64_t i = 0; i < tensor_dout->numel(); ++i) {
init_dout.push_back(static_cast<T>(1.0));
}
TensorFromVector(init_dout, ctx, tensor_dout);
tensor_dout->Resize({2, 3, 5});
ctx.Wait();
// run
f::AttributeMap attrs;
auto op = f::OpRegistry::CreateOp(
op_type, {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}},
{{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, attrs);
auto place = ctx.GetPlace();
op->Run(*scope, place);
std::vector<T> dx_vec;
TensorToVector(*tensor_dx, ctx, &dx_vec);
std::vector<T> dy_vec;
TensorToVector(*tensor_dy, ctx, &dy_vec);
ctx.Wait();
float expected_x, expected_y;
if (op_type == "elementwise_add_grad") {
expected_x = 1.0;
expected_y = 6.0;
} else if (op_type == "elementwise_sub_grad") {
expected_x = 1.0;
expected_y = -6.0;
}
for (uint32_t i = 0; i < dx_vec.size(); i++) {
EXPECT_EQ(dx_vec[i], static_cast<T>(expected_x));
}
for (uint32_t i = 0; i < dy_vec.size(); i++) {
EXPECT_EQ(dy_vec[i], static_cast<T>(expected_y));
}
}
TEST(elementwise_add, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx, "elementwise_add");
}
TEST(elementwise_sub, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx, "elementwise_sub");
}
TEST(elementwise_sub, NPU_fp16) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<p::float16>(&scope, ctx, "elementwise_sub");
}
TEST(elementwise_sub_grad, NPU) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
CompareGrad<float>(&scope, ctx, "elementwise_sub_grad");
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
#include "paddle/fluid/operators/npu_op_runner.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class ElementwiseSubNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Output<Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
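// NOTE: The forward kernel maps directly onto the ACL "Sub" operator; the
// NpuOpRunner builds the aclTensorDesc/aclDataBuffer lists for X, Y and Out and
// launches them with aclopCompileAndExecute on the NPU stream (see
// npu_op_runner.cc). The grad kernel below additionally reduces dout over the
// broadcast axes by hand, since dy may have fewer dimensions than dout.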
template <typename DeviceContext, typename T>
class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
dx->mutable_data<T>(ctx.GetPlace());
dy->mutable_data<T>(ctx.GetPlace());
    // NOTE(zhiqiu): It seems Ascend Sub follows the broadcast semantics with
    // default axis=-1?
    // So, the sub_grad should do the reduce if needed.
// For example, the shape of each variable in elementwise_sub:
// x, dx: [2, 3, 5]
// y, dy: [1, 5]
// out, dout: [2, 3, 5]
// Then, out = x - y => dx = dout, dy = -dout
// And, the shape of dy can be computed by two stages reduce,
// 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false.
// 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true.
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
// For dx
// stage 1
auto reduce_ndim = dout->dims().size() - dx->dims().size();
std::vector<int> axes;
for (auto i = 0; i < reduce_ndim; ++i) {
axes.push_back(i);
}
auto tmp_dout = dout;
Tensor reduced_dout(dx->type());
if (axes.size() != 0) {
std::vector<int64_t> reduced_dout_dims;
for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
reduced_dout_dims.push_back(dout->dims()[i]);
}
reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
tmp_dout = &reduced_dout;
}
// stage 2
axes.clear();
for (auto i = 0; i < dx->dims().size(); ++i) {
if (dx->dims()[i] == 1) {
axes.push_back(i);
}
}
if (axes.size() != 0) {
auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
} else {
framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
}
// For dy
// stage 1
reduce_ndim = dout->dims().size() - dy->dims().size();
axes.clear();
for (auto i = 0; i < reduce_ndim; ++i) {
axes.push_back(i);
}
tmp_dout = dout;
Tensor reduced_dy(dy->type());
if (axes.size() != 0) {
std::vector<int64_t> reduced_dout_dims;
for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
reduced_dout_dims.push_back(dout->dims()[i]);
}
reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
tmp_dout = &reduced_dout;
}
// stage 2
axes.clear();
auto* tmp_dy = tmp_dout;
for (auto i = 0; i < dy->dims().size(); ++i) {
if (dy->dims()[i] == 1) {
axes.push_back(i);
}
}
if (axes.size() != 0) {
reduced_dy.Resize(dy->dims());
reduced_dy.mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy},
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
tmp_dy = &reduced_dy;
}
// stage 3, negative
auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {});
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
elementwise_sub,
ops::ElementwiseSubNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ElementwiseSubNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_NPU_KERNEL(
elementwise_sub_grad,
ops::ElementwiseSubGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ElementwiseSubGradNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
#endif
......@@ -149,6 +149,13 @@ void set_constant_with_place<platform::XPUPlace>(
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::NPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/npu_op_runner.h"
#include <paddle/fluid/framework/data_type.h>
#include <paddle/fluid/framework/operator.h>
#include <map>
#include <string>
#include <vector>
#include "acl/acl.h"
#include "acl/acl_op_compiler.h"
#include "paddle/fluid/framework/framework.pb.h"
namespace paddle {
namespace operators {
static std::map<framework::proto::VarType::Type, aclDataType>
DTYPE_2_ACL_DTYPE = {
{framework::proto::VarType::BOOL, ACL_BOOL},
{framework::proto::VarType::INT16, ACL_INT16},
{framework::proto::VarType::INT32, ACL_INT32},
{framework::proto::VarType::INT64, ACL_INT64},
{framework::proto::VarType::FP16, ACL_FLOAT16},
{framework::proto::VarType::FP32, ACL_FLOAT},
{framework::proto::VarType::FP64, ACL_DOUBLE},
};
static std::map<DataLayout, aclFormat> DATA_LAYOUT_2_ACL_FORMAT = {
{DataLayout::kNCHW, ACL_FORMAT_NCHW},
{DataLayout::kNHWC, ACL_FORMAT_NHWC},
{DataLayout::kAnyLayout, ACL_FORMAT_ND},
};
aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype) {
auto iter = DTYPE_2_ACL_DTYPE.find(dtype);
PADDLE_ENFORCE_NE(iter, DTYPE_2_ACL_DTYPE.end(),
platform::errors::NotFound(
"The data type (%s) can not convert to ACL data type.",
framework::DataTypeToString(dtype)));
return iter->second;
}
aclFormat ConvertToNpuFormat(DataLayout layout) {
auto iter = DATA_LAYOUT_2_ACL_FORMAT.find(layout);
PADDLE_ENFORCE_NE(
iter, DATA_LAYOUT_2_ACL_FORMAT.end(),
platform::errors::NotFound(
"The data type (%s) can not convert to ACL data type.", layout));
return iter->second;
}
NpuOpRunner::NpuOpRunner(std::string op_type) : op_type_(op_type) {
attr_ = aclopCreateAttr();
}
NpuOpRunner::NpuOpRunner(std::string op_type, const std::vector<Tensor> &inputs,
const std::vector<Tensor> &outputs,
const AttributeMap &attrs)
: op_type_(op_type) {
attr_ = aclopCreateAttr();
AddInputs(inputs);
AddOutputs(outputs);
AddAttrs(attrs);
}
NpuOpRunner::~NpuOpRunner() {
// TODO(zhiqiu): handle free
}
const std::string &NpuOpRunner::Type() { return op_type_; }
NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name,
const Attribute &attr) {
if (attr.type() == typeid(bool)) {
PADDLE_ENFORCE_NPU_SUCCESS(
aclopSetAttrBool(attr_, name.c_str(), BOOST_GET_CONST(bool, attr)));
} else if (attr.type() == typeid(int)) {
PADDLE_ENFORCE_NPU_SUCCESS(
aclopSetAttrInt(attr_, name.c_str(), BOOST_GET_CONST(int, attr)));
} else if (attr.type() == typeid(int64_t)) {
PADDLE_ENFORCE_NPU_SUCCESS(
aclopSetAttrInt(attr_, name.c_str(), BOOST_GET_CONST(int64_t, attr)));
} else if (attr.type() == typeid(float)) {
PADDLE_ENFORCE_NPU_SUCCESS(
aclopSetAttrFloat(attr_, name.c_str(), BOOST_GET_CONST(float, attr)));
} else if (attr.type() == typeid(std::vector<bool>)) {
auto a = BOOST_GET_CONST(std::vector<bool>, attr);
std::vector<uint8_t> cast_a;
for (auto it : a) {
cast_a.push_back(static_cast<uint8_t>(it));
}
PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrListBool(
attr_, name.c_str(), cast_a.size(), cast_a.data()));
} else if (attr.type() == typeid(std::vector<int>)) {
auto a = BOOST_GET_CONST(std::vector<int>, attr);
std::vector<int64_t> cast_a;
for (auto it : a) {
cast_a.push_back(static_cast<int64_t>(it));
}
PADDLE_ENFORCE_NPU_SUCCESS(
aclopSetAttrListInt(attr_, name.c_str(), cast_a.size(), cast_a.data()));
} else if (attr.type() == typeid(std::vector<int64_t>)) {
auto a = BOOST_GET_CONST(std::vector<int64_t>, attr);
PADDLE_ENFORCE_NPU_SUCCESS(
aclopSetAttrListInt(attr_, name.c_str(), a.size(), a.data()));
} else if (attr.type() == typeid(std::vector<float>)) {
auto a = BOOST_GET_CONST(std::vector<float>, attr);
PADDLE_ENFORCE_NPU_SUCCESS(
aclopSetAttrListFloat(attr_, name.c_str(), a.size(), a.data()));
} else if (attr.type() == typeid(std::string)) {
auto a = BOOST_GET_CONST(std::string, attr);
PADDLE_ENFORCE_NPU_SUCCESS(
aclopSetAttrString(attr_, name.c_str(), a.c_str()));
} else if (attr.type() == typeid(std::vector<std::string>)) {
auto a = BOOST_GET_CONST(std::vector<std::string>, attr);
std::vector<const char *> s;
for (auto &it : a) {
s.push_back(it.data());
}
PADDLE_ENFORCE_NPU_SUCCESS(
aclopSetAttrListString(attr_, name.c_str(), s.size(), s.data()));
} else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Cannot convert attribute '%s' to aclopAttr.", name));
}
return *this;
}
NpuOpRunner &NpuOpRunner::AddAttrs(const AttributeMap &attrs) {
for (const auto &pair : attrs) {
AddAttr(pair.first, pair.second);
}
return *this;
}
NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor) {
// create aclTensorDesc
input_descs_.emplace_back(CreateTensorDesc(tensor));
// create aclDataBuffer
input_buffers_.emplace_back(CreateDataBuffer(tensor));
return *this;
}
NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) {
// create aclTensorDesc
output_descs_.emplace_back(CreateTensorDesc(tensor));
// create aclDataBuffer
output_buffers_.emplace_back(CreateDataBuffer(tensor));
return *this;
}
NpuOpRunner &NpuOpRunner::AddInputs(const std::vector<Tensor> &tensors) {
for (auto tensor : tensors) {
// create aclTensorDesc
input_descs_.emplace_back(CreateTensorDesc(tensor));
// create aclDataBuffer
input_buffers_.emplace_back(CreateDataBuffer(tensor));
}
return *this;
}
NpuOpRunner &NpuOpRunner::AddOutputs(const std::vector<Tensor> &tensors) {
for (auto tensor : tensors) {
// create aclTensorDesc
output_descs_.emplace_back(CreateTensorDesc(tensor));
// create aclDataBuffer
output_buffers_.emplace_back(CreateDataBuffer(tensor));
}
return *this;
}
aclTensorDesc *NpuOpRunner::GetInputDesc(size_t index) {
PADDLE_ENFORCE_LT(index, input_descs_.size(),
platform::errors::OutOfRange(
"The index should be less than the size of inputs of "
"operator %s, but got index is %d and size is %d",
Type(), index, input_descs_.size()));
return input_descs_[index];
}
aclTensorDesc *NpuOpRunner::GetOutputDesc(size_t index) {
PADDLE_ENFORCE_LT(index, output_descs_.size(),
platform::errors::OutOfRange(
"The index should be less than the size of output of "
"operator %s, but got index is %d and size is %d",
Type(), index, output_descs_.size()));
return output_descs_[index];
}
std::vector<aclTensorDesc *> &NpuOpRunner::GetInputDescs() {
return input_descs_;
}
std::vector<aclTensorDesc *> &NpuOpRunner::GetOutputDescs() {
return output_descs_;
}
std::vector<aclDataBuffer *> &NpuOpRunner::GetInputBuffers() {
return input_buffers_;
}
std::vector<aclDataBuffer *> &NpuOpRunner::GetOutputBuffers() {
return output_buffers_;
}
aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) {
auto dtype = ConvertToNpuDtype(tensor.type());
auto format = ConvertToNpuFormat(tensor.layout());
auto dims = framework::vectorize(tensor.dims());
  VLOG(4) << "dtype: " << dtype << ", rank: " << dims.size()
          << ", dims: " << tensor.dims() << ", format: " << format;
auto *desc = aclCreateTensorDesc(dtype, dims.size(), dims.data(), format);
PADDLE_ENFORCE_NOT_NULL(
desc, platform::errors::External("Call aclCreateTensorDesc failed."));
return desc;
}
aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) {
void *ptr = tensor.data<void>();
VLOG(4) << "ptr: " << ptr << ", size: " << tensor.memory_size();
auto *buffer = aclCreateDataBuffer(ptr, tensor.memory_size());
PADDLE_ENFORCE_NOT_NULL(
buffer, platform::errors::External("Call aclCreateDataBuffer failed."));
return buffer;
}
void NpuOpRunner::Run(aclrtStream stream) {
VLOG(4) << "op_type: " << op_type_;
VLOG(4) << "input_desc.size: " << input_descs_.size();
VLOG(4) << "output_desc.size: " << output_descs_.size();
VLOG(4) << "stream: " << stream;
VLOG(4) << "attr: " << attr_;
aclError ret = aclopCompileAndExecute(
op_type_.c_str(), input_descs_.size(), input_descs_.data(),
input_buffers_.data(), output_descs_.size(), output_descs_.data(),
output_buffers_.data(), attr_, ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL,
stream);
VLOG(4) << "after aclopCompileAndExecute: " << ret;
PADDLE_ENFORCE_NPU_SUCCESS(ret);
}
} // namespace operators
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <paddle/fluid/framework/operator.h>
#include <string>
#include <vector>
#include "acl/acl.h"
#include "paddle/fluid/operators/npu_op_runner.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DataLayout = framework::DataLayout;
using Attribute = framework::Attribute;
using AttributeMap = framework::AttributeMap;
class NpuOpRunner {
public:
explicit NpuOpRunner(std::string op_type);
explicit NpuOpRunner(std::string op_type,
const std::vector<Tensor> &inputs = {},
const std::vector<Tensor> &outputs = {},
const AttributeMap &attrs = {});
~NpuOpRunner();
const std::string &Type();
NpuOpRunner &AddAttr(const std::string &name, const Attribute &attr);
NpuOpRunner &AddAttrs(const AttributeMap &attrs);
NpuOpRunner &AddInput(const Tensor &tensor);
NpuOpRunner &AddOutput(const Tensor &tensor);
NpuOpRunner &AddInputs(const std::vector<Tensor> &tensors);
NpuOpRunner &AddOutputs(const std::vector<Tensor> &tensors);
aclTensorDesc *GetInputDesc(size_t index);
aclTensorDesc *GetOutputDesc(size_t index);
std::vector<aclTensorDesc *> &GetInputDescs();
std::vector<aclTensorDesc *> &GetOutputDescs();
std::vector<aclDataBuffer *> &GetInputBuffers();
std::vector<aclDataBuffer *> &GetOutputBuffers();
void Run(aclrtStream stream);
private:
aclTensorDesc *CreateTensorDesc(Tensor tensor);
aclDataBuffer *CreateDataBuffer(Tensor tensor);
private:
std::string op_type_;
std::vector<aclDataBuffer *> input_buffers_;
std::vector<aclDataBuffer *> output_buffers_;
std::vector<aclTensorDesc *> input_descs_;
std::vector<aclTensorDesc *> output_descs_;
aclopAttr *attr_{nullptr};
};
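// Illustrative usage (mirrors elementwise_sub_op_npu.cc):
//   auto runner = NpuOpRunner("Sub", {x, y}, {out}, {});
//   runner.Run(dev_ctx.stream());
// or built incrementally:
//   NpuOpRunner runner("ReduceSumD");
//   runner.AddInput(dout).AddOutput(dx).AddAttrs(
//       {{"axes", axes}, {"keep_dims", false}});
//   runner.Run(stream);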
} // namespace operators
} // namespace paddle
......@@ -76,6 +76,10 @@ if(WITH_ASCEND)
cc_library(ascend_npu_info SRCS ascend_npu_info.cc DEPS gflags glog enforce atlas_acl)
endif()
if(WITH_ASCEND_CL)
cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor ascendcl acl_op_compiler)
endif()
add_subdirectory(dynload)
add_subdirectory(stream)
......@@ -91,11 +95,20 @@ IF(WITH_GPU OR WITH_ROCM)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
ENDIF()
IF(WITH_ASCEND_CL)
set(NPU_CTX_DEPS npu_stream npu_info)
ENDIF()
IF(WITH_MKLDNN)
set(MKLDNN_CTX_DEPS mkldnn)
ELSE()
set(MKLDNN_CTX_DEPS)
ENDIF()
IF(WITH_ASCEND_CL)
cc_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
ENDIF()
IF(WITH_GPU)
nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
ENDIF()
......@@ -105,6 +118,8 @@ ENDIF()
IF(WITH_GPU OR WITH_ROCM)
set(STREAM_CALLBACK_DEPS stream_callback_manager)
ELSEIF(WITH_ASCEND_CL)
set(STREAM_CALLBACK_DEPS stream_callback_manager)
ELSE()
set(STREAM_CALLBACK_DEPS)
ENDIF()
......@@ -118,7 +133,7 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
# memcpy depends on device_context, here add deps individually for
# avoiding cycle dependencies
cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS})
cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
......
......@@ -78,13 +78,13 @@ bool AllowTF32Cudnn() { return allow_tf32_cudnn; }
DeviceContextPool* DeviceContextPool::pool = nullptr;
platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
VLOG(4) << "DeviceContextPool Get: " << place;
auto it = device_contexts_.find(place);
if (it == device_contexts_.end()) {
PADDLE_THROW(platform::errors::Unimplemented(
"Place %s is not supported. Please check that your paddle compiles "
"with WITH_GPU or WITH_XPU option or check that your train process "
"hold the "
"correct gpu_id if you use Executor.",
"with WITH_GPU, WITH_XPU or WITH_ASCEND_CL option or check that "
"your train process set the correct device id if you use Executor.",
place));
}
return it->second.get().get();
......@@ -145,6 +145,14 @@ DeviceContextPool::DeviceContextPool(
PADDLE_THROW(
platform::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
} else if (platform::is_npu_place(p)) {
#ifdef PADDLE_WITH_ASCEND_CL
EmplaceDeviceContext<NPUDeviceContext, NPUPlace>(&device_contexts_, p);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported. Please "
"re-compile with WITH_ASCEND_CL option."));
#endif
}
}
......@@ -229,8 +237,35 @@ Place XPUDeviceContext::GetPlace() const { return place_; }
xpu::Context* XPUDeviceContext::x_context() const { return context_; }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_ASCEND_CL
NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
NPUDeviceGuard guard(place_.device);
// PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateContext(&context_, place_.device));
  // NOTE(zhiqiu): Usually, there is no need to create the context explicitly,
  // since ACL creates a default context which contains 1 default stream
  // and 1 sync stream after aclrtSetDevice.
PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetCurrentContext(&context_));
stream_.reset(new stream::NPUStream(place));
}
NPUDeviceContext::~NPUDeviceContext() {
// NPUDeviceGuard guard(place_.device);
// PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyContext(context_));
}
void NPUDeviceContext::Wait() const {
NPUDeviceGuard guard(place_.device);
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
}
aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
Place NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext NPUDeviceContext::context() const { return context_; }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class EigenCudaStreamDevice : public Eigen::StreamInterface {
public:
EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
......@@ -706,6 +741,5 @@ MKLDNNDeviceContext::BlobPtr_t<void> MKLDNNDeviceContext::GetBlob(
}
#endif
} // namespace platform
} // namespace paddle
......@@ -57,6 +57,9 @@ limitations under the License. */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/stream/cuda_stream.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/stream/npu_stream.h"
#endif
#include "unsupported/Eigen/CXX11/Tensor"
namespace Eigen {
......@@ -69,6 +72,11 @@ struct GpuDevice;
#include "paddle/fluid/platform/xpu_info.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "acl/acl.h"
#include "paddle/fluid/platform/npu_info.h"
#endif
namespace paddle {
namespace platform {
......@@ -87,11 +95,13 @@ enum DeviceType {
CPU = 0,
CUDA = 1,
XPU = 2,
NPU = 3,
};
constexpr DeviceType kCPU = DeviceType::CPU;
constexpr DeviceType kCUDA = DeviceType::CUDA;
constexpr DeviceType kXPU = DeviceType::XPU;
constexpr DeviceType kNPU = DeviceType::NPU;
class DeviceContext {
public:
......@@ -163,8 +173,52 @@ struct DefaultDeviceContextType<platform::XPUPlace> {
};
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_ASCEND_CL
class NPUDeviceContext : public DeviceContext {
public:
explicit NPUDeviceContext(NPUPlace place);
virtual ~NPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override;
aclrtContext context() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
/*! \brief Return npu stream in the device context. */
aclrtStream stream() const;
#ifdef PADDLE_WITH_ASCEND_HCCL
  /*! \brief Return hccl context. */
  HCCLContext_t hccl_context() const { return hccl_context_; }
  /*! \brief Set hccl context. */
  void set_hccl_context(HCCLContext_t context) { hccl_context_ = context; }
#endif
private:
NPUPlace place_;
aclrtContext context_;
#ifdef PADDLE_WITH_ASCEND_HCCL
HCCLContext_t hccl_context_;
#endif
  // Kept to be consistent with other DeviceContexts,
  // even though eigen_device_ is not used on NPU.
  // NOTE(zhiqiu): is it really needed?
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
std::shared_ptr<stream::NPUStream> stream_;
DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
};
template <>
struct DefaultDeviceContextType<platform::NPUPlace> {
using TYPE = NPUDeviceContext;
};
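// Typical retrieval (sketch): the context is owned by the global pool, e.g.
//   auto* npu_ctx = static_cast<platform::NPUDeviceContext*>(
//       platform::DeviceContextPool::Instance().Get(platform::NPUPlace(0)));
//   auto stream = npu_ctx->stream();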
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class CudnnWorkspaceHandle;
class EigenCudaStreamDevice;
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cudnn.h>
#include <glog/logging.h>
#include <mutex> // NOLINT
......@@ -186,3 +187,5 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
} // namespace dynload
} // namespace platform
} // namespace paddle
#endif
......@@ -45,6 +45,10 @@ limitations under the License. */
#include <thrust/system_error.h> // NOLINT
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "acl/acl.h"
#endif // PADDLE_WITH_ASCEND_CL
#include <fstream>
#include <iomanip>
#include <memory>
......@@ -970,7 +974,6 @@ DEFINE_CUDA_STATUS_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS);
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
#endif
} // namespace details
#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \
......@@ -1204,5 +1207,41 @@ inline void retry_sleep(unsigned millisecond) {
#undef DEFINE_CUDA_STATUS_TYPE
#endif // PADDLE_WITH_HIP
#ifdef PADDLE_WITH_ASCEND_CL
namespace details {
template <typename T>
struct NPUStatusType {};
#define DEFINE_NPU_STATUS_TYPE(type, success_value) \
template <> \
struct NPUStatusType<type> { \
using Type = type; \
static constexpr Type kSuccess = success_value; \
}
DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE);
} // namespace details
inline std::string build_npu_error_msg(aclError stat) {
std::ostringstream sout;
  sout << " ACL error, the error code is " << stat << ". ";
return sout.str();
}
#define PADDLE_ENFORCE_NPU_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
using __NPU_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::paddle::platform::details::NPUStatusType< \
__NPU_STATUS_TYPE__>::kSuccess; \
if (UNLIKELY(__cond__ != __success_type__)) { \
auto __summary__ = ::paddle::platform::errors::External( \
::paddle::platform::build_npu_error_msg(__cond__)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
} while (0)
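// Usage sketch: wrap any ACL call that returns aclError, e.g.
//   PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(0));
// A return value other than ACL_ERROR_NONE is converted into an External error.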
#endif // PADDLE_WITH_ASCEND_CL
} // namespace platform
} // namespace paddle
......@@ -45,7 +45,10 @@ DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely.");
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL)
/**
 * CUDA related FLAG
......@@ -84,8 +87,15 @@ DEFINE_string(selected_gpus, "",
"share-memory only.");
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_ASCEND_CL)
DEFINE_string(selected_npus, "",
"A list of device ids separated by comma, like: 0,1,2,3. "
"This option is useful when doing multi process training and "
"each process have only one device (NPU). If you want to use "
"all visible devices, set this to empty string.");
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/**
* CUDNN related FLAG
* Name: FLAGS_cudnn_deterministic
......@@ -377,7 +387,10 @@ DEFINE_double(
"Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL)
/**
* Memory related FLAG
......
......@@ -102,6 +102,7 @@ static int GetCUDADeviceCountImpl() {
}
int GetCUDADeviceCount() {
// cache the count
static auto dev_cnt = GetCUDADeviceCountImpl();
return dev_cnt;
}
......
......@@ -16,6 +16,8 @@ limitations under the License. */
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/string/split.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
......@@ -63,6 +65,7 @@ namespace framework {
std::once_flag gflags_init_flag;
std::once_flag glog_init_flag;
std::once_flag npu_init_flag;
bool InitGflags(std::vector<std::string> args) {
bool successed = false;
......@@ -145,6 +148,17 @@ void InitDevices() {
} catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime.";
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
// NOTE(zhiqiu): use singleton to explicitly init and finalize ACL
platform::AclInstance::Instance(); // NOLINT
try {
    // use user specified NPUs in single-node multi-process mode.
devices = platform::GetSelectedNPUDevices();
} catch (const std::exception &exp) {
LOG(WARNING)
<< "Compiled with PADDLE_WITH_ASCEND_CL, but no NPU found in runtime.";
}
#endif
InitDevices(devices);
}
......@@ -165,6 +179,9 @@ void InitDevices(const std::vector<int> devices) {
#endif
#ifdef PADDLE_WITH_XPU
places.emplace_back(platform::XPUPlace(devices[i]));
#endif
#ifdef PADDLE_WITH_ASCEND_CL
places.emplace_back(platform::NPUPlace(devices[i]));
#endif
}
places.emplace_back(platform::CPUPlace());
......
......@@ -35,3 +35,13 @@ DEFINE_INT_STATUS(STAT_gpu12_mem_size)
DEFINE_INT_STATUS(STAT_gpu13_mem_size)
DEFINE_INT_STATUS(STAT_gpu14_mem_size)
DEFINE_INT_STATUS(STAT_gpu15_mem_size)
// For Ascend NPU
DEFINE_INT_STATUS(STAT_npu0_mem_size)
DEFINE_INT_STATUS(STAT_npu1_mem_size)
DEFINE_INT_STATUS(STAT_npu2_mem_size)
DEFINE_INT_STATUS(STAT_npu3_mem_size)
DEFINE_INT_STATUS(STAT_npu4_mem_size)
DEFINE_INT_STATUS(STAT_npu5_mem_size)
DEFINE_INT_STATUS(STAT_npu6_mem_size)
DEFINE_INT_STATUS(STAT_npu7_mem_size)
......@@ -187,3 +187,13 @@ class StatRegistry {
USE_INT_STAT(STAT_gpu13_mem_size); \
USE_INT_STAT(STAT_gpu14_mem_size); \
USE_INT_STAT(STAT_gpu15_mem_size)
#define USE_NPU_MEM_STAT \
USE_INT_STAT(STAT_npu0_mem_size); \
USE_INT_STAT(STAT_npu1_mem_size); \
USE_INT_STAT(STAT_npu2_mem_size); \
USE_INT_STAT(STAT_npu3_mem_size); \
USE_INT_STAT(STAT_npu4_mem_size); \
USE_INT_STAT(STAT_npu5_mem_size); \
USE_INT_STAT(STAT_npu6_mem_size); \
USE_INT_STAT(STAT_npu7_mem_size)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/npu_info.h"
#include <algorithm>
#include <cstdlib>
#include <memory>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/string/split.h"
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_uint64(gpu_memory_limit_mb);
DECLARE_string(selected_npus);
constexpr static float fraction_reserve_gpu_memory = 0.05f;
USE_NPU_MEM_STAT;
namespace paddle {
namespace platform {
static int GetNPUDeviceCountImpl() {
uint32_t count;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetDeviceCount(&count));
return count;
}
int GetNPUDeviceCount() {
static auto dev_cnt = GetNPUDeviceCountImpl();
return dev_cnt;
}
int NPUCanAccessPeer(int src, int dst) {
int can = 0;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtDeviceCanAccessPeer(&can, src, dst));
return can;
}
// For example, "1.0.1"
std::string GetNPURuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than NPU count, "
"but received id is: %d. NPU count is: %d.",
id, GetNPUDeviceCount()));
int major = 0, minor = 0, patch = 0;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetVersion(&major, &minor, &patch));
return string::Sprintf("%d.%d.%d", major, minor, patch);
}
int GetCurrentNPUDeviceId() {
int device_id;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetDevice(&device_id));
return device_id;
}
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedNPUDevices() {
// use user specified NPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_npus.empty()) {
auto devices_str = paddle::string::Split(FLAGS_selected_npus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetNPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
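// For example, FLAGS_selected_npus="0,2" yields {0, 2}; an empty flag yields
// all visible devices {0, 1, ..., GetNPUDeviceCount() - 1}.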
void SetNPUDeviceId(int id) {
PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than NPU count, "
"but received id is: %d. NPU count is: %d.",
id, GetNPUDeviceCount()));
  // NOTE(zhiqiu): It is recommended to call aclrtSetDevice and aclrtResetDevice
  // in pairs.
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(id));
}
void ResetNPUDeviceId(int id) {
PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than NPU count, "
"but received id is: %d. NPU count is: %d.",
id, GetNPUDeviceCount()));
PADDLE_ENFORCE_NPU_SUCCESS(aclrtResetDevice(id));
}
void NPUMemoryUsage(size_t *available, size_t *total) {
size_t actual_available, actual_total;
RecordedNPUMemGetInfo(available, total, &actual_available, &actual_total,
platform::GetCurrentNPUDeviceId());
}
size_t NPUAvailableMemToAlloc() {
size_t total = 0;
size_t available = 0;
NPUMemoryUsage(&available, &total);
size_t reserving =
static_cast<size_t>(fraction_reserve_gpu_memory * available);
// If available size is less than minimum chunk size, no usable memory exists
size_t available_to_alloc = available - reserving;
size_t min_chunk_size = NPUMinChunkSize();
if (available_to_alloc < min_chunk_size) {
available_to_alloc = 0;
}
VLOG(10) << "NPU usage " << (available >> 20) << "M/" << (total >> 20)
<< "M, " << (available_to_alloc >> 20) << "M available to allocate";
return available_to_alloc;
}
size_t NPUMaxAllocSize() {
return std::max(NPUInitAllocSize(), NPUReallocSize());
}
static size_t NPUAllocSize(bool realloc) {
size_t available_to_alloc = NPUAvailableMemToAlloc();
PADDLE_ENFORCE_GT(
available_to_alloc, 0,
platform::errors::ResourceExhausted("Not enough available NPU memory."));
// If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
// allocated by fraction
size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
: FLAGS_initial_gpu_memory_in_mb;
size_t alloc_bytes =
(flag_mb > 0ul ? flag_mb << 20 : available_to_alloc *
FLAGS_fraction_of_gpu_memory_to_use);
PADDLE_ENFORCE_GE(
available_to_alloc, alloc_bytes,
platform::errors::ResourceExhausted("Not enough available NPU memory."));
VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
<< " MiB, is it Re-alloc: " << realloc;
return alloc_bytes;
}
size_t NPUInitAllocSize() { return NPUAllocSize(/* realloc = */ false); }
size_t NPUReallocSize() { return NPUAllocSize(/* realloc = */ true); }
size_t NPUMinChunkSize() {
// Allow to allocate the minimum chunk size is 256 bytes.
return 1 << 8;
}
size_t NPUMaxChunkSize() {
size_t max_chunk_size = NPUMaxAllocSize();
VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
return max_chunk_size;
}
void NPUMemcpyAsync(void *dst, const void *src, size_t count,
enum aclrtMemcpyKind kind, aclrtStream stream,
size_t dst_max_count) {
dst_max_count = dst_max_count ? dst_max_count : count;
VLOG(4) << dst << " " << dst_max_count << " " << src << " " << count << " "
<< kind << " " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(
aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream));
}
void NPUMemcpySync(void *dst, const void *src, size_t count,
enum aclrtMemcpyKind kind, size_t dst_max_count) {
// NOTE(zhiqiu): The default max_count is count
dst_max_count = dst_max_count ? dst_max_count : count;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind));
}
void NPUMemcpyPeerASync(void *dst, int dst_device, const void *src,
size_t count, enum aclrtMemcpyKind kind,
aclrtStream stream, size_t dst_max_count) {
dst_max_count = dst_max_count ? dst_max_count : count;
PADDLE_ENFORCE_NPU_SUCCESS(
aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream));
}
void NPUMemcpyPeerSync(void *dst, int dst_device, const void *src, size_t count,
enum aclrtMemcpyKind kind, size_t dst_max_count) {
// NOTE(zhiqiu): The default max_count is count
dst_max_count = dst_max_count ? dst_max_count : count;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind));
}
void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream,
size_t max_count) {
max_count = max_count ? max_count : count;
PADDLE_ENFORCE_NPU_SUCCESS(
aclrtMemsetAsync(dst, max_count, value, count, stream));
}
void NPUStreamSync(aclrtStream stream) {
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream));
}
static void RaiseNonOutOfMemoryError(aclError *status) {
if (*status == ACL_ERROR_BAD_ALLOC) {
*status = ACL_ERROR_NONE;
}
PADDLE_ENFORCE_NPU_SUCCESS(*status);
}
class RecordedNPUMallocHelper {
private:
explicit RecordedNPUMallocHelper(int dev_id, uint64_t limit_size = 0)
: dev_id_(dev_id), limit_size_(limit_size) {
if (NeedRecord()) {
mtx_.reset(new std::mutex());
}
}
DISABLE_COPY_AND_ASSIGN(RecordedNPUMallocHelper);
public:
static RecordedNPUMallocHelper *Instance(int dev_id) {
std::call_once(once_flag_, [] {
int dev_cnt = GetNPUDeviceCount();
instances_.reserve(dev_cnt);
for (int i = 0; i < dev_cnt; ++i) {
// NOTE(zhiqiu): share the flags with gpu, avoid more flags.
instances_.emplace_back(
new RecordedNPUMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20));
}
});
PADDLE_ENFORCE_GE(
dev_id, 0,
platform::errors::OutOfRange(
"Device id must be not less than 0, but got %d.", dev_id));
PADDLE_ENFORCE_LT(
dev_id, instances_.size(),
platform::errors::OutOfRange("Device id %d exceeds npu card number %d.",
dev_id, instances_.size()));
return instances_[dev_id].get();
}
/**
   * Try to allocate `size` bytes of NPU memory. Only ACL_ERROR_BAD_ALLOC
* or ACL_ERROR_NONE would be returned.
*/
aclError Malloc(void **ptr, size_t size) {
LockGuardPtr<std::mutex> lock(mtx_);
if (UNLIKELY(NeedRecord() && cur_size_ + size > limit_size_)) {
return ACL_ERROR_BAD_ALLOC;
}
NPUDeviceGuard guard(dev_id_);
auto result = aclrtMalloc(ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
if (result == ACL_ERROR_NONE) {
if (NeedRecord()) {
cur_size_ += size;
}
STAT_INT_ADD("STAT_npu" + std::to_string(dev_id_) + "_mem_size", size);
return result;
} else {
RaiseNonOutOfMemoryError(&result);
      // Non out-of-memory errors would be raised inside
      // RaiseNonOutOfMemoryError. Therefore, we can
      // return ACL_ERROR_BAD_ALLOC directly here.
return ACL_ERROR_BAD_ALLOC;
}
}
/**
   * Free npu memory. Usually, free is not allowed to raise an error.
   * If it does, the process should crash.
*/
void Free(void *ptr, size_t size) {
NPUDeviceGuard guard(dev_id_);
auto result = aclrtFree(ptr);
PADDLE_ENFORCE_NPU_SUCCESS(result);
if (NeedRecord()) {
std::lock_guard<std::mutex> guard(*mtx_);
cur_size_ -= size;
}
STAT_INT_SUB("STAT_npu" + std::to_string(dev_id_) + "_mem_size", size);
}
bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail,
size_t *actual_total) {
{
NPUDeviceGuard guard(dev_id_);
auto result = aclrtGetMemInfo(ACL_HBM_MEM, actual_avail, actual_total);
if (result != ACL_ERROR_NONE) {
*actual_avail = 0;
}
RaiseNonOutOfMemoryError(&result);
}
if (NeedRecord()) {
std::lock_guard<std::mutex> guard(*mtx_);
*avail = std::min(*actual_avail, limit_size_ - cur_size_);
*total = std::min(*actual_total, limit_size_);
return *total < *actual_total;
} else {
*avail = *actual_avail;
*total = *actual_total;
return false;
}
}
inline bool NeedRecord() const { return limit_size_ != 0; }
uint64_t RecordedSize() const {
LockGuardPtr<std::mutex> lock(mtx_);
return NeedRecord() ? cur_size_ : 0;
}
uint64_t LimitSize() const { return limit_size_; }
private:
const int dev_id_;
const uint64_t limit_size_;
uint64_t cur_size_{0};
mutable std::unique_ptr<std::mutex> mtx_;
static std::once_flag once_flag_;
static std::vector<std::unique_ptr<RecordedNPUMallocHelper>> instances_;
};
std::once_flag RecordedNPUMallocHelper::once_flag_;
std::vector<std::unique_ptr<RecordedNPUMallocHelper>>
RecordedNPUMallocHelper::instances_;
aclError RecordedNPUMalloc(void **ptr, size_t size, int dev_id) {
return RecordedNPUMallocHelper::Instance(dev_id)->Malloc(ptr, size);
}
void RecordedNPUFree(void *p, size_t size, int dev_id) {
return RecordedNPUMallocHelper::Instance(dev_id)->Free(p, size);
}
bool RecordedNPUMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
size_t *actual_total, int dev_id) {
return RecordedNPUMallocHelper::Instance(dev_id)->GetMemInfo(
avail, total, actual_avail, actual_total);
}
uint64_t RecordedNPUMallocSize(int dev_id) {
return RecordedNPUMallocHelper::Instance(dev_id)->RecordedSize();
}
bool IsNPUMallocRecorded(int dev_id) {
return RecordedNPUMallocHelper::Instance(dev_id)->NeedRecord();
}
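// NOTE: Recording is enabled only when FLAGS_gpu_memory_limit_mb (shared with
// GPU to avoid extra flags) is non-zero; RecordedNPUMalloc then fails with
// ACL_ERROR_BAD_ALLOC once the per-device recorded size would exceed the limit.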
AclInstance::~AclInstance() {}
AclInstance &AclInstance::Instance() {
static AclInstance instance;
return instance;
}
AclInstance::AclInstance() {
PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr));
VLOG(4) << "Call aclrtSetDevice ";
// NOTE(zhiqiu): why set devices here?
  // Because ACL creates a default context which contains 2 streams
  // when calling aclrtSetDevice, usually we do not need to
  // create contexts explicitly. And, for each device, aclrtSetDevice
  // needs to be called in pairs with aclrtResetDevice to destroy the default
  // context. Here, we use this singleton and its static instance to manage
  // the devices and make sure they are reset before program exit.
devices_ = platform::GetSelectedNPUDevices();
for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
SetNPUDeviceId(*it);
VLOG(4) << "Call aclrtSetDevice " << *it;
}
}
void AclInstance::Finalize() {
// NOTE(zhiqiu): DO NOT perform finalize in destructor
// to avoid problems caused by destructor order of static
// object.
for (size_t i = 0; i < devices_.size(); ++i) {
auto status = aclrtResetDevice(devices_[i]);
VLOG(4) << "Call aclrtResetDevice " << devices_[i]
<< " status = " << status;
}
auto status = aclFinalize();
VLOG(4) << "Call aclFinalize, status = " << status;
}
} // namespace platform
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <stddef.h>
#include <string>
#include <vector>
#include "acl/acl.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
//! Get the total number of NPU devices in system.
int GetNPUDeviceCount();
//! Get the runtime version of the ith NPU
std::string GetNPURuntimeVersion(int id);
//! Check if this device can access peer or not.
int NPUCanAccessPeer(int src, int dst);
//! Get the current NPU device id in system.
int GetCurrentNPUDeviceId();
//! Get the current NPU stream.
int GetCurrentStream();
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedNPUDevices();
//! Set the NPU device id for next execution.
void SetNPUDeviceId(int device_id);
//! Reset the NPU device id for next execution.
void ResetNPUDeviceId(int device_id);
//! Get the memory usage of current NPU device.
void NPUMemoryUsage(size_t *available, size_t *total);
//! Get the memory available to allocate, which is the available NPU memory
//! minus the reserved amount.
size_t NPUAvailableMemToAlloc();
//! Get the maximum allocation size of current NPU device.
size_t NPUMaxAllocSize();
//! Get the initial allocation size of current NPU device.
size_t NPUInitAllocSize();
//! Get the re-allocation size of current NPU device.
size_t NPUReallocSize();
//! Get the minimum chunk size for NPU buddy allocator.
size_t NPUMinChunkSize();
//! Get the maximum chunk size for NPU buddy allocator.
size_t NPUMaxChunkSize();
//! Copy memory from address src to dst asynchronously.
void NPUMemcpyAsync(void *dst, const void *src, size_t count,
enum aclrtMemcpyKind kind, aclrtStream stream,
size_t dst_max_count = 0);
//! Copy memory from address src to dst synchronously.
void NPUMemcpySync(void *dst, const void *src, size_t count,
enum aclrtMemcpyKind kind, size_t dst_max_count = 0);
//! Set memory dst with value count size asynchronously
void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream,
size_t max_count = 0);
//! Copy memory from one device to another device asynchronously.
void NPUMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, aclrtStream stream,
size_t max_count = 0);
//! Copy memory from one device to another device synchronously.
void NPUMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count, size_t max_count = 0);
//! Blocks until stream has completed all operations.
void NPUStreamSync(aclrtStream stream);
//! aclrtMalloc with recorded info
aclError RecordedNPUMalloc(void **ptr, size_t size, int dev_id);
//! aclrtFree with recorded info
void RecordedNPUFree(void *p, size_t size, int dev_id);
//! Get available and total NPU memory, taking the memory limit into account.
bool RecordedNPUMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
size_t *actual_total, int dev_id);
//! Get the recorded aclrtMalloc size. If recording is disabled, return 0.
uint64_t RecordedNPUMallocSize(int dev_id);
bool IsNPUMallocRecorded(int dev_id);
class NPUDeviceGuard {
public:
explicit inline NPUDeviceGuard(int dev_id) {
int prev_id = platform::GetCurrentNPUDeviceId();
if (prev_id != dev_id) {
prev_id_ = prev_id;
platform::SetNPUDeviceId(dev_id);
}
}
inline ~NPUDeviceGuard() {
if (prev_id_ != -1) {
platform::SetNPUDeviceId(prev_id_);
}
}
NPUDeviceGuard(const NPUDeviceGuard &o) = delete;
NPUDeviceGuard &operator=(const NPUDeviceGuard &o) = delete;
private:
int prev_id_{-1};
};
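// Usage sketch: switch to a device for the current scope and restore the
// previous device on exit, e.g.
//   {
//     NPUDeviceGuard guard(dev_id);
//     PADDLE_ENFORCE_NPU_SUCCESS(
//         aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
//   }  // the previous device id is restored here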
class AclInstance {
public:
  // NOTE(zhiqiu): Commonly, throwing an exception in a destructor is not
  // recommended, so no PADDLE_ENFORCE here; call the ACL API directly.
~AclInstance();
AclInstance(const AclInstance &o) = delete;
const AclInstance &operator=(const AclInstance &o) = delete;
static AclInstance &Instance();
void Finalize();
private:
// forbid calling default constructor
AclInstance();
std::vector<int> devices_;
};
} // namespace platform
} // namespace paddle
#endif
......@@ -33,6 +33,7 @@ class PlacePrinter : public boost::static_visitor<> {
os_ << "CUDAPlace(" << p.device << ")";
}
void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
private:
......@@ -49,6 +50,10 @@ bool is_xpu_place(const Place &p) {
return boost::apply_visitor(IsXPUPlace(), p);
}
bool is_npu_place(const Place &p) {
return boost::apply_visitor(IsNPUPlace(), p);
}
bool is_cpu_place(const Place &p) {
return boost::apply_visitor(IsCPUPlace(), p);
}
......@@ -67,6 +72,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
return true;
} else if (is_xpu_place(p1)) {
return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2);
} else if (is_npu_place(p1)) {
return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2);
} else {
return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2);
}
......
......@@ -72,16 +72,31 @@ struct XPUPlace {
int device;
};
struct NPUPlace {
NPUPlace() : NPUPlace(0) {}
explicit NPUPlace(int d) : device(d) {}
inline int GetDeviceId() const { return device; }
// needed for variant equality comparison
inline bool operator==(const NPUPlace &o) const { return device == o.device; }
inline bool operator!=(const NPUPlace &o) const { return !(*this == o); }
inline bool operator<(const NPUPlace &o) const { return device < o.device; }
int device;
};
struct IsCUDAPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const CUDAPlace &gpu) const { return true; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return true; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
struct IsCPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &cpu) const { return true; }
bool operator()(const CPUPlace &) const { return true; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -89,27 +104,38 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
};
struct IsXPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &xpu) const { return true; }
bool operator()(const XPUPlace &) const { return true; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
class Place
: public boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace> {
struct IsNPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return true; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
CUDAPinnedPlace> {
private:
using PlaceBase =
boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace>;
boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace, CUDAPinnedPlace>;
public:
Place() = default;
Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {} // NOLINT
Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {} // NOLINT
Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {} // NOLINT
Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT
Place(const CUDAPinnedPlace &cuda_pinned_place) // NOLINT
: PlaceBase(cuda_pinned_place) {}
......@@ -126,6 +152,7 @@ using PlaceList = std::vector<Place>;
bool is_gpu_place(const Place &);
bool is_xpu_place(const Place &);
bool is_npu_place(const Place &);
bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &);
bool places_are_same_class(const Place &, const Place &);
......@@ -153,6 +180,16 @@ struct PlaceVisitorWrapper
#endif
}
typename Visitor::result_type operator()(const NPUPlace &npu) const {
#ifdef PADDLE_WITH_ASCEND
return visitor_(npu);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with NPU. Cannot visit npu device"));
return typename Visitor::result_type();
#endif
}
typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return visitor_(cuda);
......
IF(WITH_GPU OR WITH_ROCM)
cc_library(cuda_stream SRCS cuda_stream.cc DEPS enforce boost)
ENDIF()
IF(WITH_ASCEND_CL)
cc_library(npu_stream SRCS npu_stream.cc DEPS enforce boost stream_callback_manager)
ENDIF()
......@@ -49,8 +49,8 @@ bool CUDAStream::Init(const Place& place, const Priority& priority) {
cudaStreamCreateWithPriority(&stream_, kDefaultFlag, 0));
#endif
}
callback_manager_.reset(new StreamCallbackManager(stream_));
VLOG(3) << "CUDAStream Init stream: " << stream_
callback_manager_.reset(new StreamCallbackManager<gpuStream_t>(stream_));
VLOG(3) << "GPUStream Init stream: " << stream_
<< ", priority: " << static_cast<int>(priority);
return true;
}
......
......@@ -101,7 +101,7 @@ class CUDAStream final {
cudaStream_t stream_{nullptr};
#endif
Priority priority_{Priority::kNormal};
std::unique_ptr<StreamCallbackManager> callback_manager_;
std::unique_ptr<StreamCallbackManager<gpuStream_t>> callback_manager_;
DISABLE_COPY_AND_ASSIGN(CUDAStream);
};
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/stream/npu_stream.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/npu_info.h"
namespace paddle {
namespace platform {
namespace stream {
bool NPUStream::Init(const Place& place) {
PADDLE_ENFORCE_EQ(is_npu_place(place), true,
platform::errors::InvalidArgument(
"NPU stream must be created using npu place."));
place_ = place;
NPUDeviceGuard guard(BOOST_GET_CONST(NPUPlace, place_).device);
PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateStream(&stream_));
callback_manager_.reset(new StreamCallbackManager<aclrtStream>(stream_));
VLOG(3) << "NPUStream Init stream: " << stream_;
return true;
}
void NPUStream::Destroy() {
NPUDeviceGuard guard(BOOST_GET_CONST(NPUPlace, place_).device);
Wait();
WaitCallback();
if (stream_) {
PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyStream(stream_));
}
stream_ = nullptr;
}
void NPUStream::Wait() const {
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream_));
}
} // namespace stream
} // namespace platform
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
#include <memory>
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream_callback_manager.h"
namespace paddle {
namespace platform {
namespace stream {
#ifdef PADDLE_WITH_ASCEND_CL
class NPUStream final {
public:
NPUStream() = default;
explicit NPUStream(const Place& place) { Init(place); }
virtual ~NPUStream() { Destroy(); }
bool Init(const Place& place);
template <typename Callback>
void AddCallback(Callback&& callback) const {
callback_manager_->AddCallback(callback);
}
template <typename Callback>
void RecordEvent(aclrtEvent ev, Callback callback) const {
callback();
PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(ev, stream_));
}
void RecordEvent(aclrtEvent ev) const {
PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(ev, stream_));
}
void WaitEvent(aclrtEvent ev) const {
PADDLE_ENFORCE_NPU_SUCCESS(aclrtStreamWaitEvent(stream_, ev));
}
void Wait() const;
void WaitCallback() const { callback_manager_->Wait(); }
aclrtStream raw_stream() const { return stream_; }
void Destroy();
private:
Place place_;
aclrtStream stream_{nullptr};
std::unique_ptr<StreamCallbackManager<aclrtStream>> callback_manager_;
DISABLE_COPY_AND_ASSIGN(NPUStream);
};
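// Usage note: an NPUStream is typically owned by NPUDeviceContext; kernels get
// the raw aclrtStream via dev_ctx.stream() and pass it to NpuOpRunner::Run(),
// while AddCallback()/WaitCallback() mirror the CUDAStream callback API.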
#endif
} // namespace stream
} // namespace platform
} // namespace paddle
......@@ -21,11 +21,18 @@ namespace platform {
#ifdef PADDLE_WITH_HIP
static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status,
void *user_data)
#elif CUDA_VERSION >= 10000
static void CUDART_CB StreamCallbackFunc(void *user_data)
#endif
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10000
static void CUDART_CB StreamCallbackFunc(void *user_data)
#else
static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
cudaError_t status, void *user_data)
static void CUDART_CB
StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void *user_data)
#endif
#endif
#if PADDLE_WITH_ASCEND_CL
static void StreamCallbackFunc(void *user_data)
#endif
{
std::unique_ptr<std::function<void()>> func(
......@@ -33,10 +40,13 @@ static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
(*func)();
}
StreamCallbackManager::StreamCallbackManager(const gpuStream_t stream)
template <typename Stream>
StreamCallbackManager<Stream>::StreamCallbackManager(const Stream stream)
: stream_(stream), thread_pool_(1) {}
void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
template <typename Stream>
void StreamCallbackManager<Stream>::AddCallback(
std::function<void()> callback) const {
auto *callback_func = new std::function<void()>(std::move(callback));
auto *func = new std::function<void()>([this, callback_func] {
std::lock_guard<std::mutex> lock(mtx_);
......@@ -45,23 +55,37 @@ void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
(*callback_func)();
});
});
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipStreamAddCallback(stream_, StreamCallbackFunc, func, 0));
#elif CUDA_VERSION >= 10000
#endif
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10000
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaLaunchHostFunc(stream_, StreamCallbackFunc, func));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0));
#endif
#endif
#if PADDLE_WITH_ASCEND_CL
PADDLE_ENFORCE_NPU_SUCCESS(aclrtLaunchCallback(StreamCallbackFunc, func,
ACL_CALLBACK_BLOCK, stream_));
#endif
}
void StreamCallbackManager::Wait() const {
template <typename Stream>
void StreamCallbackManager<Stream>::Wait() const {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_));
#else
#endif
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_));
#endif
#ifdef PADDLE_WITH_ASCEND_CL
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream_));
#endif
{
std::lock_guard<std::mutex> lock(mtx_);
......@@ -71,5 +95,15 @@ void StreamCallbackManager::Wait() const {
}
}
#ifdef PADDLE_WITH_CUDA
template struct StreamCallbackManager<gpuStream_t>;
#endif
#ifdef PADDLE_WITH_HIP
template struct StreamCallbackManager<hipStream_t>;
#endif
#ifdef PADDLE_WITH_ASCEND_CL
template struct StreamCallbackManager<aclrtStream>;
#endif
} // namespace platform
} // namespace paddle
......@@ -37,9 +37,10 @@ namespace platform {
// NOTE(zjl): clean StreamCallbackManager to make compilation faster
// Make StreamCallbackManager thread-safe
template <typename Stream>
class StreamCallbackManager {
public:
explicit StreamCallbackManager(const gpuStream_t stream);
explicit StreamCallbackManager(const Stream stream);
~StreamCallbackManager() = default;
......@@ -48,7 +49,7 @@ class StreamCallbackManager {
void Wait() const;
private:
const gpuStream_t stream_;
const Stream stream_;
mutable ::ThreadPool thread_pool_;
mutable std::mutex mtx_;
mutable std::future<void> last_future_;
......
......@@ -88,10 +88,17 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
// others
DECLARE_bool(sync_nccl_allreduce);
#endif
#ifdef PADDLE_WITH_XPU
// device management
DECLARE_string(selected_xpus);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
// device management
DECLARE_string(selected_npus);
#endif
#ifdef PADDLE_WITH_DISTRIBUTE
DECLARE_int32(rpc_send_thread_num);
DECLARE_int32(rpc_get_thread_num);
......@@ -374,6 +381,11 @@ static void RegisterGlobalVarGetterSetter() {
#ifdef PADDLE_WITH_XPU
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_xpus);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_npus);
#endif
#ifdef PADDLE_WITH_DISTRIBUTE
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_send_thread_num,
FLAGS_rpc_get_thread_num,
......
......@@ -107,6 +107,10 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/npu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
......@@ -163,6 +167,14 @@ bool IsCompiledWithXPU() {
#endif
}
bool IsCompiledWithNPU() {
#ifndef PADDLE_WITH_ASCEND_CL
return false;
#else
return true;
#endif
}
bool IsCompiledWithMKLDNN() {
#ifndef PADDLE_WITH_MKLDNN
return false;
......@@ -569,6 +581,11 @@ PYBIND11_MODULE(core_noavx, m) {
make_ddim(x_dim), make_ddim(y_dim), -1));
});
#ifdef PADDLE_WITH_ASCEND_CL
m.def("_npu_finalize",
[]() { platform::AclInstance::Instance().Finalize(); });
#endif
m.def(
"_append_python_callable_object_and_return_id",
[](py::object py_obj) -> size_t {
......@@ -641,6 +658,10 @@ PYBIND11_MODULE(core_noavx, m) {
[](framework::Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::NPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_double",
[](framework::Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<double>(place);
......@@ -688,12 +709,19 @@ PYBIND11_MODULE(core_noavx, m) {
return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
})
.def("_clear", &framework::Tensor::clear)
.def("_mutable_data",
[](framework::Tensor &self, paddle::platform::NPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
})
.def("set", SetTensorFromPyArray<paddle::platform::CPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::XPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::NPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false,
R"DOC(
......@@ -701,7 +729,7 @@ PYBIND11_MODULE(core_noavx, m) {
Args:
lod (numpy.ndarray): The data to set.
place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace): The place where the
place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace|NPUPlace): The place where the
LoDTensor is to be set.
zero_copy (bool, optional): Whether to share memory with the input numpy array.
This parameter only works with CPUPlace. Default: False.
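A minimal Python-side sketch of the Tensor.set binding registered above for NPUPlace (assumes an NPU build of Paddle and a visible device 0; not part of this diff):
import numpy as np
import paddle.fluid.core as core
t = core.LoDTensor()
# copies the numpy array to NPU device 0 through SetTensorFromPyArray<NPUPlace>
t.set(np.random.rand(2, 3).astype('float32'), core.NPUPlace(0))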
......@@ -1429,6 +1457,18 @@ All parameter, weight, gradient are variables in Paddle.
return new paddle::platform::XPUDeviceContext(place);
#endif
})
.def_static("create",
[](paddle::platform::NPUPlace& place)
-> paddle::platform::DeviceContext* {
#ifndef PADDLE_WITH_ASCEND_CL
PADDLE_THROW(
platform::errors::PermissionDenied(
"Cannot use NPUPlace in CPU/GPU/XPU version, "
"Please recompile or reinstall Paddle with NPU support."));
#else
return new paddle::platform::NPUDeviceContext(place);
#endif
})
.def_static("create",
[](paddle::platform::CUDAPlace& place)
-> paddle::platform::DeviceContext* {
......@@ -1529,6 +1569,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::NPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
.def("_get_device_id",
......@@ -1598,6 +1639,7 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_XPU
m.def("get_xpu_device_count", platform::GetXPUDeviceCount);
#endif
py::class_<paddle::platform::CPUPlace>(m, "CPUPlace", R"DOC(
CPUPlace is a descriptor of a device.
It represents a CPU device on which a tensor will be allocated and a model will run.
......@@ -1613,6 +1655,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("_type", &PlaceIndex<platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
.def("_equals",
......@@ -1650,6 +1693,8 @@ All parameter, weight, gradient are variables in Paddle.
&IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::XPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::NPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
.def("_equals",
......@@ -1657,6 +1702,65 @@ All parameter, weight, gradient are variables in Paddle.
.def("__repr__", string::to_string<const platform::CUDAPinnedPlace &>)
.def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
// NPUPlace
py::class_<platform::NPUPlace>(m, "NPUPlace", R"DOC(
NPUPlace is a descriptor of a device.
It represents a NPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
npu_place = paddle.NPUPlace(0)
)DOC")
.def("__init__",
[](platform::NPUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_ASCEND_CL
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid NPUPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) {
if (platform::GetNPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use NPU because there is no NPU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid NPUPlace(%d), must inside [0, %d), because NPU "
"number on your machine is %d",
dev_id, platform::GetNPUDeviceCount(),
platform::GetNPUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::NPUPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use NPU because you have installed CPU/GPU version "
"PaddlePaddle.\n"
"If you want to use NPU, please try to install NPU version "
"PaddlePaddle by: pip install paddlepaddle-xpu\n"
"If you only have CPU, please change NPUPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::NPUPlace>)
.def("_equals",
&IsSamePlace<platform::NPUPlace, platform::CUDAPinnedPlace>)
.def("__str__", string::to_string<const platform::NPUPlace &>);
py::class_<platform::Place>(m, "Place")
.def(py::init<>())
.def("_type", &PlaceIndex<platform::Place>)
......@@ -1664,6 +1768,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
.def("is_gpu_place",
[](platform::Place &self) { return platform::is_gpu_place(self); })
......@@ -1671,6 +1776,8 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self) { return platform::is_cpu_place(self); })
.def("is_xpu_place",
[](platform::Place &self) { return platform::is_xpu_place(self); })
.def("is_npu_place",
[](platform::Place &self) { return platform::is_npu_place(self); })
.def("is_cuda_pinned_place",
[](platform::Place &self) {
return platform::is_cuda_pinned_place(self);
......@@ -1683,6 +1790,10 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self) {
return BOOST_GET_CONST(platform::XPUPlace, self).device;
})
.def("npu_device_id",
[](platform::Place &self) {
return BOOST_GET_CONST(platform::NPUPlace, self).device;
})
.def("set_place", [](platform::Place &self,
const platform::Place &other) { self = other; })
.def("set_place",
......@@ -1702,6 +1813,10 @@ All parameter, weight, gradient are variables in Paddle.
const platform::CUDAPinnedPlace &cuda_pinned_place) {
self = cuda_pinned_place;
})
.def("set_place",
[](platform::Place &self, const platform::NPUPlace &npu_place) {
self = npu_place;
})
.def("__repr__", string::to_string<const platform::Place &>)
.def("__str__", string::to_string<const platform::Place &>);
......@@ -1726,6 +1841,9 @@ All parameter, weight, gradient are variables in Paddle.
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::XPUPlace &place) { self.Run(scope, place); })
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::NPUPlace &place) { self.Run(scope, place); })
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::CUDAPlace &place) { self.Run(scope, place); })
......@@ -1828,6 +1946,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("is_compiled_with_ascend", IsCompiledWithAscend);
m.def("is_compiled_with_rocm", IsCompiledWithROCM);
m.def("is_compiled_with_npu", IsCompiledWithNPU);
m.def("is_compiled_with_xpu", IsCompiledWithXPU);
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("supports_bfloat16", SupportsBfloat16);
......
......@@ -294,6 +294,22 @@ void SetTensorFromPyArrayT(
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (paddle::platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
platform::Place tmp_place = place;
platform::NPUDeviceGuard guard(
BOOST_GET_CONST(platform::NPUPlace, tmp_place).device);
auto dst = self->mutable_data<T>(place);
platform::NPUMemcpySync(dst, array.data(), array.nbytes(),
ACL_MEMCPY_HOST_TO_DEVICE);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &ctx = *pool.Get(place);
ctx.Wait();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use NPUPlace in CPU/GPU/XPU version. "
"Please recompile or reinstall Paddle with NPU support."));
#endif
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/npu_info.h"
int main(int argc, char** argv) {
paddle::memory::allocation::UseAllocatorStrategyGFlag();
......@@ -38,11 +39,13 @@ int main(int argc, char** argv) {
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL)
envs.push_back("fraction_of_gpu_memory_to_use");
envs.push_back("initial_gpu_memory_in_mb");
envs.push_back("reallocate_gpu_memory_in_mb");
envs.push_back("allocator_strategy");
envs.push_back("selected_gpus");
#elif __clang__
envs.push_back("use_mkldnn");
envs.push_back("initial_cpu_memory_in_mb");
......@@ -61,6 +64,10 @@ int main(int argc, char** argv) {
undefok.push_back("initial_cpu_memory_in_mb");
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
envs.push_back("selected_npus");
#endif
char* env_str = nullptr;
if (envs.size() > 0) {
std::string env_string = "--tryfromenv=";
......@@ -93,6 +100,10 @@ int main(int argc, char** argv) {
int ret = RUN_ALL_TESTS();
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::AclInstance::Instance().Finalize();
#endif
if (env_str) free(env_str);
if (undefok_str) free(undefok_str);
......
......@@ -238,6 +238,7 @@ from .framework import ParamAttr #DEFINE_ALIAS
from .framework import create_parameter #DEFINE_ALIAS
from .framework import CPUPlace #DEFINE_ALIAS
from .framework import CUDAPlace #DEFINE_ALIAS
from .framework import NPUPlace #DEFINE_ALIAS
from .framework import CUDAPinnedPlace #DEFINE_ALIAS
from .framework import grad #DEFINE_ALIAS
......@@ -262,6 +263,7 @@ from .device import set_device
from .device import get_device
from .device import is_compiled_with_cuda #DEFINE_ALIAS
from .device import is_compiled_with_xpu
from .device import is_compiled_with_npu
from .device import XPUPlace
# from .tensor.tensor import Tensor #DEFINE_ALIAS
# from .tensor.tensor import LoDTensor #DEFINE_ALIAS
......
......@@ -32,12 +32,28 @@ __all__ = [
# 'cuda_places',
# 'CUDAPinnedPlace',
# 'CUDAPlace',
'is_compiled_with_cuda'
'is_compiled_with_cuda',
'is_compiled_with_npu'
]
_cudnn_version = None
def is_compiled_with_npu():
"""
Whether this whl package can be used to run the model on NPU.
Returns (bool): `True` if NPU is supported, otherwise `False`.
Examples:
.. code-block:: python
import paddle
support_npu = paddle.is_compiled_with_npu()
"""
return core.is_compiled_with_npu()
def is_compiled_with_xpu():
"""
Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun
......@@ -165,6 +181,7 @@ def set_device(device):
device_id = device_info_list[1]
device_id = int(device_id)
place = core.XPUPlace(device_id)
framework._set_expected_place(place)
return place
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
import paddle
from paddle.distributed.fleet.launch_utils import get_cluster, logger, get_host_name_ip, DeviceMode
def _get_ascend_rankfile(rank_table_file_path):
"""
Args:
rank_table_file_path: ascend npu rank file json
{
"status": "completed",
"version": "1.0",
"server_count": "2",
"server_list": [
{
"server_id": "192.168.24.217",
"device": [
{
"device_id": "0",
"device_ip": "192.1.184.23",
"rank_id": "0"
},
{
"device_id": "1",
"device_ip": "192.2.21.93",
"rank_id": "1"
}
]
},
{
"server_id": "192.168.26.177",
"device": [
{
"device_id": "0",
"device_ip": "192.1.94.132",
"rank_id": "2"
},
{
"device_id": "1",
"device_ip": "192.2.94.30",
"rank_id": "3"
}
]
}
]
}
Returns:
node_ips: node ip list
device_count: number of npu per machine
"""
json_data = None
with open(rank_table_file_path) as json_file:
json_data = json.load(json_file)
node_ips = []
device_count = 0
server_list = json_data['server_list']
for server in server_list:
node_ips.append(server['server_id'])
device_list = server['device']
device_count = len(device_list)
return node_ips, device_count
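A hedged usage sketch for the parser above; the file name is hypothetical and the expected values follow the sample JSON in the docstring:
node_ips, device_count = _get_ascend_rankfile("rank_table_file.json")
# with the sample file: node_ips == ['192.168.24.217', '192.168.26.177'], device_count == 2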
def get_cloud_cluster(rank_table_file=None,
device_mode=DeviceMode.ASCEND_NPU,
devices_per_proc=None,
start_port=6070):
"""
Args:
rank_table_file: string, ascend npu rank file path
device_mode: DeviceMode(Int)
devices_per_proc:list
start_port: the start port of current runtime env
"""
if rank_table_file:
# multi trainers
node_ips, device_count = _get_ascend_rankfile(rank_table_file)
node_index = os.environ.get("PADDLE_TRAINER_ID")
node_ip = None
if node_index is None:
_, node_ip = get_host_name_ip()
else:
node_ip = node_ips[int(node_index)]
assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
% (node_ip, node_ips)
else:
# single trainer (single ascend card)
node_ips = ["127.0.0.1"]
node_ip = node_ips[0]
device_count = 1
devices_per_proc = None
if devices_per_proc is None:
devices_per_proc = [str(x) for x in range(device_count)]
free_ports = [
x for x in range(start_port, start_port + len(devices_per_proc))
]
trainer_endpoints = []
for ip in node_ips:
trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
return get_cluster(node_ips, node_ip, trainer_endpoints, device_mode,
devices_per_proc)
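For reference, a hedged sketch of the call site (it matches the launch_collective hunk later in this diff; the devices_per_proc value is illustrative, and os and DeviceMode come from the module-level imports above):
cluster, pod = get_cloud_cluster(
    rank_table_file=os.getenv("RANK_TABLE_FILE", None),
    device_mode=DeviceMode.ASCEND_NPU,
    devices_per_proc=["0", "1"],
    start_port=6070)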
......@@ -73,6 +73,7 @@ from paddle.distributed.fleet import launch_utils
# TODO(danleifeng): Don't import * from a module
from paddle.distributed.fleet.launch_utils import *
import paddle.distributed.fleet.cloud_utils as cloud_utils
import paddle.distributed.fleet.ascend_utils as ascend_utils
def _print_arguments(args):
......@@ -120,7 +121,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
default=None,
help="It's for ascend npu training."
"For example:"
"--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one gpu."
"--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one npu."
)
if fluid.core.is_compiled_with_cuda():
......@@ -237,6 +238,13 @@ def launch_collective(args):
cluster, pod = cloud_utils.get_cloud_cluster(
args.ips, device_mode, devices_per_proc, start_port)
logger.debug("get cluster from cloud:{}".format(cluster))
elif device_mode == DeviceMode.ASCEND_NPU:
# for ascend
cluster, pod = ascend_utils.get_cloud_cluster(
rank_table_file=os.getenv("RANK_TABLE_FILE", None),
device_mode=device_mode,
devices_per_proc=devices_per_proc,
start_port=start_port)
else:
# trainers_num = 1 or not use paddlecloud ips="a,b"
cluster, pod = get_cluster_from_args(args, device_mode,
......
......@@ -593,8 +593,8 @@ def get_ascend_npus(npus):
if npus is None:
count = fluid.core.NPUDevice.get_device_count()
if count <= 0:
return ret
ret = [x for x in range(count)]
return None
ret = [str(x) for x in range(count)]
else:
ret = [x.strip() for x in npus.split(',')]
return ret
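A hedged sketch of the corrected behaviour above: an explicit --ascend_npus string is split and stripped, while the auto-detect path now returns string ids, or None when no NPU is visible:
assert get_ascend_npus("0, 1") == ['0', '1']
# get_ascend_npus(None) -> ['0', ..., str(count - 1)], or None when get_device_count() <= 0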
......
......@@ -214,7 +214,8 @@ class AscendOptimizer(Optimizer):
parameter_list=None,
no_grad_set=None,
auto_dp=False,
rank_table_file=None):
rank_table_file=None,
precision_mode="must_keep_origin_dtype"):
minimized = None
if self.inner_opt:
minimized = self.inner_opt.minimize(
......@@ -234,7 +235,7 @@ class AscendOptimizer(Optimizer):
config = {
"ge.exec.deviceId": str(fleet.local_device_ids()),
"ge.graphRunMode": "1",
"ge.exec.precision_mode": "must_keep_origin_dtype",
"ge.exec.precision_mode": precision_mode,
}
# if multi trainers
if rank_table_file and fleet.world_size() > 1:
......
......@@ -200,7 +200,8 @@ class AscendParserBase(object):
def _accumulated_op_id(self):
global global_cnt
global_cnt += 1
return "." + str(global_cnt)
name = "." + str(global_cnt)
return name
def _create_ge_tensor(self, shape, dtype, value):
tensor_desc = core.GETensorDesc(
......@@ -1622,10 +1623,14 @@ class MulGradParser(AscendParserBase):
"unsqueeze" + self._accumulated_op_id(),
"Unsqueeze").set_input("x",
y).set_attr_vec_int32("axes", [0])
y_stack = core.GEOperatorFactory.create_operator(
"stack" + self._accumulated_op_id(),
"TileWithAxis").set_input("x", y_unsqueeze).set_attr_int32(
"axis", 0).set_attr_int32("tiles", shape_out_grad[0])
x_grad = core.GEOperatorFactory.create_operator(
self.parser_name + self._accumulated_op_id(),
"BatchMatMul").set_input("x1", out_grad).set_input(
"x2", y_unsqueeze).set_attr_bool(
"x2", y_stack).set_attr_bool(
"adj_x1", False).set_attr_bool("adj_x2", True)
y_grad = core.GEOperatorFactory.create_operator(
self.parser_name + self._accumulated_op_id(),
......
......@@ -68,7 +68,8 @@ from .input import embedding, one_hot
from . import distribute_lookup_table
from .param_attr import ParamAttr, WeightNormParamAttr
from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope
from .core import LoDTensor, LoDTensorArray, Scope, _Scope
from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace
from .incubate import fleet
from .incubate import data_generator
from .transpiler import DistributeTranspiler, \
......@@ -124,6 +125,7 @@ __all__ = framework.__all__ + executor.__all__ + \
'XPUPlace',
'CUDAPlace',
'CUDAPinnedPlace',
'NPUPlace',
'Tensor',
'ParamAttr',
'WeightNormParamAttr',
......@@ -232,6 +234,16 @@ def __bootstrap__():
'gpu_memory_limit_mb',
'conv2d_disable_cudnn',
]
if core.is_compiled_with_npu():
read_env_flags += [
'selected_npus',
'fraction_of_gpu_memory_to_use',
'initial_gpu_memory_in_mb',
'reallocate_gpu_memory_in_mb',
'gpu_memory_limit_mb',
]
core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0])
# don't init_p2p when in unittest to save time.
......
......@@ -1213,6 +1213,7 @@ class Executor(object):
# In distributed training, the compiled program is saved in Program._graph
has_compiled_graph = isinstance(program._graph,
compiler.CompiledProgram)
if has_compiled_graph:
program._graph._compile(scope, self.place)
# _graph in program does not support inference since the _graph is optimized
......
......@@ -6201,7 +6201,7 @@ def _get_paddle_place(place):
if place is None:
return place
if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace,
core.CUDAPinnedPlace, core.CUDAPlace)):
core.CUDAPinnedPlace, core.CUDAPlace, core.NPUPlace)):
return place
if not isinstance(place, str):
......@@ -6211,9 +6211,11 @@ def _get_paddle_place(place):
place = place.lower()
if (place == "cpu"):
return core.CPUPlace()
if (place == "device"):
return core.Place()
# GPU
avaliable_gpu_place = re.match(r'gpu:\d+', place)
if place == "gpu_pinned" or place == "gpu" or avaliable_gpu_place:
if not core.is_compiled_with_cuda():
......@@ -6229,6 +6231,8 @@ def _get_paddle_place(place):
device_id = place_info_list[1]
device_id = int(device_id)
return core.CUDAPlace(device_id)
# XPU
avaliable_xpu_place = re.match(r'xpu:\d+', place)
if avaliable_xpu_place:
if not core.is_compiled_with_xpu():
......@@ -6239,9 +6243,22 @@ def _get_paddle_place(place):
device_id = place_info_list[1]
device_id = int(device_id)
return core.XPUPlace(device_id)
# NPU
avaliable_npu_place = re.match(r'npu:\d+', place)
if avaliable_npu_place:
if not core.is_compiled_with_npu():
raise ValueError(
"The device should not be {}, since PaddlePaddle is " \
"not compiled with NPU".format(avaliable_npu_place))
place_info_list = place.split(':', 1)
device_id = place_info_list[1]
device_id = int(device_id)
return core.NPUPlace(device_id)
raise ValueError(
"paddle support CPUPlace, CUDAPlace,CUDAPinnedPlace and XPUPlace, Please check your Place Input"
)
"Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace and NPUPlace, but received {}.".
format(place))
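A hedged sketch of the string form handled above (NPU build and a visible device 0 assumed):
from paddle.fluid.framework import _get_paddle_place
place = _get_paddle_place("npu:0")   # -> core.NPUPlace(0)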
def _get_paddle_place_list(places):
......
......@@ -625,6 +625,10 @@ if (WITH_XPU)
add_subdirectory(xpu)
endif()
if (WITH_ASCEND_CL)
add_subdirectory(npu)
endif()
if (WITH_MKLDNN)
add_subdirectory(mkldnn)
endif()
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import unittest
import sys
sys.path.append("..")
from op_test import OpTest, _set_use_system_allocator
import paddle
import paddle.fluid as fluid
paddle.enable_static()
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestElementwiseAddOp(OpTest):
def setUp(self):
self.set_npu()
self.op_type = "elementwise_add"
self.place = paddle.NPUPlace(0)
self.init_dtype()
self.init_input_output()
self.init_kernel_type()
self.init_axis()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
}
self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
self.outputs = {'Out': self.out}
def set_npu(self):
self.__class__.use_npu = True
def init_kernel_type(self):
self.use_mkldnn = False
def init_input_output(self):
self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.out = np.add(self.x, self.y)
def init_dtype(self):
self.dtype = np.float32
def init_axis(self):
self.axis = -1
def test_check_output(self):
self.check_output_with_place(self.place, check_dygraph=False)
# TODO(ascendrc): Test grad op after it is implemented.
# def test_check_grad_normal(self):
# self.check_grad_with_place(
# self.place, ['X', 'Y'],
# 'Out',
# max_relative_error=0.006,
# check_dygraph=False)
#
# def test_check_grad_ingore_x(self):
# self.check_grad_with_place(
# self.place, ['Y'],
# 'Out',
# no_grad_set=set("X"),
# max_relative_error=0.006,
# check_dygraph=False)
#
# def test_check_grad_ingore_y(self):
# self.check_grad_with_place(
# self.place, ['X'],
# 'Out',
# no_grad_set=set("Y"),
# max_relative_error=0.006,check_dygraph=False)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestAddAPI(unittest.TestCase):
def test_name(self):
with paddle.static.program_guard(paddle.static.Program()):
x = paddle.static.data(name="x", shape=[2, 3], dtype="float32")
y = paddle.static.data(name='y', shape=[2, 3], dtype='float32')
y_1 = paddle.add(x, y, name='add_res')
self.assertEqual(('add_res' in y_1.name), True)
def test_static(self):
with paddle.static.program_guard(paddle.static.Program()):
x_np = np.array([2, 3, 4]).astype('float32')
y_np = np.array([1, 5, 2]).astype('float32')
x = paddle.static.data(name="x", shape=[3], dtype='float32')
y = paddle.static.data(name="y", shape=[3], dtype='float32')
x_reshape = paddle.reshape(x, [3, 1])
y_reshape = paddle.reshape(y, [3, 1])
z = paddle.add(x_reshape, y_reshape)
z = paddle.reshape(z, shape=[3])
place = paddle.NPUPlace(0)
exe = paddle.static.Executor(place)
x_value, y_value, z_value = exe.run(feed={"x": x_np,
"y": y_np},
fetch_list=[x, y, z])
z_expected = np.array([3., 8., 6.])
self.assertEqual(
(x_value == x_np).all(),
True,
msg="x_value = {}, but expected {}".format(x_value, x_np))
self.assertEqual(
(y_value == y_np).all(),
True,
msg="y_value = {}, but expected {}".format(y_value, y_np))
self.assertEqual(
(z_value == z_expected).all(),
True,
msg="z_value = {}, but expected {}".format(z_value, z_expected))
def test_backward(self):
# TODO(ascendrc): Test backward after add grad npu op implemented.
pass
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestAddError(unittest.TestCase):
def test_errors(self):
with paddle.static.program_guard(paddle.static.Program()):
# the input of elementwise_add must be Variable.
x1 = fluid.create_lod_tensor(
np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
y1 = fluid.create_lod_tensor(
np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
self.assertRaises(TypeError, paddle.add, x1, y1)
# the input dtype must be float16 or float32 or float64 or int32 or int64
x2 = paddle.static.data(
name='x2', shape=[3, 4, 5, 6], dtype="uint8")
y2 = paddle.static.data(
name='y2', shape=[3, 4, 5, 6], dtype="uint8")
self.assertRaises(TypeError, paddle.add, x2, y2)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import unittest
import sys
sys.path.append("..")
from op_test import OpTest
import paddle
import paddle.fluid as fluid
paddle.enable_static()
SEED = 2021
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestElementwiseSubOp(OpTest):
def setUp(self):
self.set_npu()
self.op_type = "elementwise_sub"
self.place = paddle.NPUPlace(0)
self.init_dtype()
self.init_input_output()
self.init_kernel_type()
self.init_axis()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
}
self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
self.outputs = {'Out': self.out}
def set_npu(self):
self.__class__.use_npu = True
def init_kernel_type(self):
self.use_mkldnn = False
def init_input_output(self):
self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.out = np.subtract(self.x, self.y)
def init_dtype(self):
self.dtype = np.float32
def init_axis(self):
self.axis = 0
def test_check_output(self):
self.check_output_with_place(self.place, check_dygraph=False)
# TODO(ascendrc): For grad tests, OpTest raises FatalError: Segmentation fault
# when calling op.run, which may be caused by a system environment exception;
# the exact cause has not been located yet.
# def test_check_grad_normal(self):
# self.check_grad_with_place(
# self.place, ['X', 'Y'],
# 'Out',
# max_relative_error=0.006,
# check_dygraph=False)
#
# def test_check_grad_ingore_x(self):
# self.check_grad_with_place(
# self.place, ['Y'],
# 'Out',
# no_grad_set=set("X"),
# max_relative_error=0.006,
# check_dygraph=False)
#
# def test_check_grad_ingore_y(self):
# self.check_grad_with_place(
# self.place, ['X'],
# 'Out',
# no_grad_set=set("Y"),
# max_relative_error=0.006,check_dygraph=False)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestSubtractAPI(unittest.TestCase):
def test_name(self):
with paddle.static.program_guard(paddle.static.Program()):
x = paddle.static.data(name="x", shape=[2, 3], dtype="float32")
y = paddle.static.data(name='y', shape=[2, 3], dtype='float32')
y_1 = paddle.subtract(x, y, name='add_res')
self.assertEqual(('add_res' in y_1.name), True)
def test_static(self):
with paddle.static.program_guard(paddle.static.Program()):
x_np = np.array([2, 3, 4]).astype('float32')
y_np = np.array([1, 5, 2]).astype('float32')
x = paddle.static.data(name="x", shape=[3], dtype='float32')
y = paddle.static.data(name="y", shape=[3], dtype='float32')
x_reshape = paddle.reshape(x, [3, 1])
y_reshape = paddle.reshape(y, [3, 1])
z = paddle.subtract(x_reshape, y_reshape)
z = paddle.reshape(z, shape=[3])
place = paddle.NPUPlace(0)
exe = paddle.static.Executor(place)
x_value, y_value, z_value = exe.run(feed={"x": x_np,
"y": y_np},
fetch_list=[x, y, z])
z_expected = np.array([1., -2., 2.])
self.assertEqual(
(x_value == x_np).all(),
True,
msg="x_value = {}, but expected {}".format(x_value, x_np))
self.assertEqual(
(y_value == y_np).all(),
True,
msg="y_value = {}, but expected {}".format(y_value, y_np))
self.assertEqual(
(z_value == z_expected).all(),
True,
msg="z_value = {}, but expected {}".format(z_value, z_expected))
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestSubtractError(unittest.TestCase):
def test_errors(self):
with paddle.static.program_guard(paddle.static.Program()):
# the input of elementwise_sub must be Variable.
x1 = fluid.create_lod_tensor(
np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
y1 = fluid.create_lod_tensor(
np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
self.assertRaises(TypeError, paddle.subtract, x1, y1)
# the input dtype must be float16 or float32 or float64 or int32 or int64
x2 = paddle.static.data(
name='x2', shape=[3, 4, 5, 6], dtype="uint8")
y2 = paddle.static.data(
name='y2', shape=[3, 4, 5, 6], dtype="uint8")
self.assertRaises(TypeError, paddle.subtract, x2, y2)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestSubtractNet(unittest.TestCase):
def _test(self, run_npu=True):
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
main_prog.random_seed = SEED
startup_prog.random_seed = SEED
np.random.seed(SEED)
a_np = np.random.random(size=(32, 32)).astype('float32')
b_np = np.random.random(size=(32, 32)).astype('float32')
label_np = np.random.randint(2, size=(32, 1)).astype('int64')
with paddle.static.program_guard(main_prog, startup_prog):
a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
label = paddle.static.data(
name="label", shape=[32, 1], dtype='int64')
sum = paddle.add(a, b)
c = paddle.assign(b)
z = paddle.subtract(sum, c)
fc_1 = fluid.layers.fc(input=z, size=128)
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
if run_npu:
place = paddle.NPUPlace(0)
else:
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
exe.run(startup_prog)
for epoch in range(100):
pred_res, loss_res = exe.run(
main_prog,
feed={"a": a_np,
"b": b_np,
"label": label_np},
fetch_list=[prediction, loss])
if epoch % 10 == 0:
print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
epoch, pred_res[0], loss_res))
return pred_res, loss_res
def test_npu(self):
npu_pred, npu_loss = self._test(True)
cpu_pred, cpu_loss = self._test(False)
self.assertTrue(np.allclose(npu_pred, cpu_pred))
self.assertTrue(np.allclose(npu_loss, cpu_loss))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle
import numpy as np
from paddle.fluid import core
paddle.enable_static()
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNpuPlace(unittest.TestCase):
def test(self):
p = core.Place()
p.set_place(paddle.NPUPlace(0))
self.assertTrue(p.is_npu_place())
self.assertEqual(p.npu_device_id(), 0)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNpuPlaceError(unittest.TestCase):
def test_static(self):
# NPU is not supported in ParallelExecutor
prog = paddle.static.Program()
with paddle.static.program_guard(prog):
x_np = np.array([2, 3, 4]).astype('float32')
y_np = np.array([1, 5, 2]).astype('float32')
x = paddle.static.data(name="x", shape=[3], dtype='float32')
y = paddle.static.data(name="y", shape=[3], dtype='float32')
z = paddle.add(x, y)
compiled_prog = paddle.static.CompiledProgram(prog)
place = paddle.NPUPlace(0)
exe = paddle.static.Executor(place)
with self.assertRaisesRegex(RuntimeError,
"NPU is not supported in ParallelExecutor"):
exe.run(compiled_prog, feed={"x": x_np, "y": y_np})
if __name__ == '__main__':
unittest.main()
......@@ -266,7 +266,10 @@ class OpTest(unittest.TestCase):
np.random.seed(123)
random.seed(124)
cls._use_system_allocator = _set_use_system_allocator(True)
if paddle.is_compiled_with_npu():
cls._use_system_allocator = _set_use_system_allocator(False)
else:
cls._use_system_allocator = _set_use_system_allocator(True)
@classmethod
def tearDownClass(cls):
......@@ -298,6 +301,9 @@ class OpTest(unittest.TestCase):
def is_rocm_op_test():
return core.is_compiled_with_rocm()
def is_npu_op_test():
return hasattr(cls, "use_npu") and cls.use_npu == True
if not hasattr(cls, "op_type"):
raise AssertionError(
"This test do not have op_type in class attrs, "
......@@ -319,7 +325,8 @@ class OpTest(unittest.TestCase):
and not hasattr(cls, 'exist_fp64_check_grad') \
and not is_xpu_op_test() \
and not is_mkldnn_op_test() \
and not is_rocm_op_test():
and not is_rocm_op_test() \
and not is_npu_op_test():
raise AssertionError(
"This test of %s op needs check_grad with fp64 precision." %
cls.op_type)
......@@ -1216,7 +1223,8 @@ class OpTest(unittest.TestCase):
# Check inplace for given op, its grad op, its grad_grad op, etc.
# No effect on original OpTest
# Currently not support ParallelExecutor on XPUPlace.
if not paddle.is_compiled_with_xpu():
if not paddle.is_compiled_with_xpu(
) and not paddle.is_compiled_with_npu():
self.check_inplace_output_with_place(
place, no_check_set=no_check_set, inplace_atol=inplace_atol)
......
......@@ -15,54 +15,39 @@
from __future__ import print_function
import unittest
from op_test import OpTest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.framework as framework
import warnings
import paddle
class TestStaticDeviceManage(unittest.TestCase):
def test_cpu_device(self):
paddle.set_device('cpu')
def _test_device(self, device_name, device_class):
paddle.set_device(device_name)
out1 = paddle.zeros(shape=[1, 3], dtype='float32')
out2 = paddle.ones(shape=[1, 3], dtype='float32')
out3 = paddle.concat(x=[out1, out2], axis=0)
exe = paddle.fluid.Executor()
exe = paddle.static.Executor()
exe.run(paddle.fluid.default_startup_program())
res = exe.run(fetch_list=[out3])
device = paddle.get_device()
self.assertEqual(isinstance(exe.place, core.CPUPlace), True)
self.assertEqual(device, "cpu")
self.assertEqual(isinstance(exe.place, device_class), True)
self.assertEqual(device, device_name)
def test_cpu_device(self):
self._test_device("cpu", core.CPUPlace)
def test_gpu_device(self):
if core.is_compiled_with_cuda():
out1 = paddle.zeros(shape=[1, 3], dtype='float32')
out2 = paddle.ones(shape=[1, 3], dtype='float32')
out3 = paddle.concat(x=[out1, out2], axis=0)
paddle.set_device('gpu:0')
exe = paddle.fluid.Executor()
exe.run(paddle.fluid.default_startup_program())
res = exe.run(fetch_list=[out3])
device = paddle.get_device()
self.assertEqual(isinstance(exe.place, core.CUDAPlace), True)
self.assertEqual(device, "gpu:0")
self._test_device("gpu:0", core.CUDAPlace)
def test_xpu_device(self):
if core.is_compiled_with_xpu():
out1 = paddle.zeros(shape=[1, 3], dtype='float32')
out2 = paddle.ones(shape=[1, 3], dtype='float32')
out3 = paddle.concat(x=[out1, out2], axis=0)
paddle.set_device('xpu:0')
exe = paddle.fluid.Executor()
exe.run(paddle.fluid.default_startup_program())
res = exe.run(fetch_list=[out3])
device = paddle.get_device()
self.assertEqual(isinstance(exe.place, core.XPUPlace), True)
self.assertEqual(device, "xpu:0")
self._test_device("xpu:0", core.XPUPlace)
class TestImperativeDeviceManage(unittest.TestCase):
......
#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
RANK_TABLE_FILE_NAME="rank_table_file.json"
cat > ${RANK_TABLE_FILE_NAME} <<EOF
{
"status": "completed",
"version": "1.0",
"server_count": "2",
"server_list": [
{
"server_id": "127.0.0.1",
"device": [
{
"device_id": "0",
"device_ip": "192.1.184.23",
"rank_id": "0"
},
{
"device_id": "1",
"device_ip": "192.2.21.93",
"rank_id": "1"
}
]
},
{
"server_id": "127.0.0.2",
"device": [
{
"device_id": "0",
"device_ip": "192.1.94.132",
"rank_id": "2"
},
{
"device_id": "1",
"device_ip": "192.2.94.30",
"rank_id": "3"
}
]
}
]
}
EOF
# set ascend rank table file env
export RANK_TABLE_FILE="${PWD}/${RANK_TABLE_FILE_NAME}"
# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0
export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=2
distributed_args="--run_mode=collective --log_dir=testlog"
python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend
str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0"
str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1"
file_0="multi_process_fleetlaunchascend.check_0.log"
file_1="multi_process_fleetlaunchascend.check_1.log"
echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
echo "find trainer 0"
else
echo "not find trainer 0"
exit -1
fi
if grep -q "$str2" "$file_1"; then
echo "find trainer 1"
else
echo "not find trainer 1"
exit -1
fi
# test async poll process
if [ -f $file_0 ]; then
rm $file_0
fi
if [ -f $file_1 ]; then
rm $file_1
fi
......@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: import framework api under this directory
# TODO: import framework api under this directory
__all__ = [
'create_parameter', 'ParamAttr', 'CPUPlace', 'CUDAPlace', 'CUDAPinnedPlace',
'get_default_dtype', 'set_default_dtype'
'NPUPlace', 'get_default_dtype', 'set_default_dtype'
]
__all__ += ['grad', 'LayerList', 'load', 'save', 'no_grad', 'DataParallel']
......@@ -31,6 +31,7 @@ from ..fluid.layers.tensor import create_parameter #DEFINE_ALIAS
from ..fluid.core import CPUPlace #DEFINE_ALIAS
from ..fluid.core import CUDAPlace #DEFINE_ALIAS
from ..fluid.core import CUDAPinnedPlace #DEFINE_ALIAS
from ..fluid.core import NPUPlace #DEFINE_ALIAS
from ..fluid.core import VarBase #DEFINE_ALIAS
from paddle.fluid import core #DEFINE_ALIAS
......