From 1201cd2ef2aa11ee4de8a3d29da64bb1f1c99b6f Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Tue, 9 Feb 2021 17:50:43 +0800
Subject: [PATCH] [feature] support npu allocator, part 2 (#30972)

* support npu allocator
* add npu device context
* fix some compile problem
* fix some compile problem
* add npu info
* compile ok
* fix include dir
* support naive_best_fit_allocator
* run ut ok, but failed to exit
* call aclrtResetDevice before exit
* fix aclFinalize
* add system allocator test
* add selected_gpus in gtest
* add tensor_test for npu
* support npu op, initial commit
* add npu stream
* add elementwise_add_op
* compile ok
* fix typo
* fix elementwise_add_op_npu_test
* support op run
* test can run but failed
* change aclopExecuteV2 to aclopCompileAndExecute
---
 cmake/external/ascend.cmake                   | 14 ++--
 paddle/fluid/framework/fleet/ascend_wrapper.h |  8 ++-
 .../elementwise_add_op_npu_test.cc            |  8 ++-
 paddle/fluid/operators/npu_op_runner.cc       | 69 +++++++++++++------
 paddle/fluid/operators/npu_op_runner.h        |  7 +-
 paddle/fluid/platform/CMakeLists.txt          |  2 +-
 paddle/fluid/platform/device_context.cc       |  4 ++
 paddle/fluid/platform/device_context.h        |  5 ++
 paddle/fluid/platform/npu_info.cc             | 25 ++++++-
 paddle/fluid/platform/npu_info.h              | 14 ++++
 10 files changed, 120 insertions(+), 36 deletions(-)

diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake
index c23d30c5b9b..e3279a1e205 100644
--- a/cmake/external/ascend.cmake
+++ b/cmake/external/ascend.cmake
@@ -63,16 +63,22 @@ elseif(WITH_ASCEND_CL)
   set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
   set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
 
-  set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
-  set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
+  set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
+  set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
   set(atlas_acl_lib ${ATLAS_ACL_DIR}/libascendcl.so)
-  set(ATLAS_ACL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
+  set(atlas_acl_op_compiler_lib ${ATLAS_ACL_DIR}/libacl_op_compiler.so)
+  set(ATLAS_ACL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+  message(STATUS "ATLAS_ACL_INC_DIR ${ATLAS_ACL_INC_DIR}")
   message(STATUS "ATLAS_ACL_LIB_DIR ${ATLAS_ACL_DIR}")
 
   INCLUDE_DIRECTORIES(${ATLAS_ACL_INC_DIR})
 
   ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
   SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
-  add_custom_target(extern_ascend DEPENDS atlas_acl)
+
+  ADD_LIBRARY(atlas_acl_op_compiler SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET atlas_acl_op_compiler PROPERTY IMPORTED_LOCATION ${atlas_acl_op_compiler_lib})
+  add_custom_target(extern_ascend DEPENDS atlas_acl atlas_acl_op_compiler)
+
 endif()
diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h
index 27a9b47e630..798fe39bfff 100644
--- a/paddle/fluid/framework/fleet/ascend_wrapper.h
+++ b/paddle/fluid/framework/fleet/ascend_wrapper.h
@@ -59,12 +59,14 @@ class AscendInstance {
   std::map<std::string, std::string> _GetDefaultInitSessionOptions() {
     std::map<std::string, std::string> init_options;
-    //init_options["a"] = "b";
-    //init_options["ge.trainFlag"] = "1";
+    // init_options["a"] = "b";
+    // init_options["ge.trainFlag"] = "1";
     return init_options;
   }
 
-  ge::Status InitGEForUT() { return ge::GEInitialize(_GetDefaultInitOptions()); }
+  ge::Status InitGEForUT() {
+    return ge::GEInitialize(_GetDefaultInitOptions());
+  }
 
   void InitGlobalResouces() {
     LOG(INFO) << "Begin ascend InitGlobalResouces";
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu_test.cc
index 64915ef394d..adc31cae0ee 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu_test.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu_test.cc
@@ -39,11 +39,9 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
   // init
   auto x = scope->Var("X");
   auto tensor_x = x->GetMutable<f::LoDTensor>();
-  tensor_x->Resize({10, 10});
 
   auto y = scope->Var("Y");
   auto tensor_y = y->GetMutable<f::LoDTensor>();
-  tensor_y->Resize({10, 10});
 
   std::vector<float> init;
   for (int64_t i = 0; i < 10 * 10; ++i) {
@@ -51,7 +49,11 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
   }
 
   TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({10, 10});
   TensorFromVector(init, ctx, tensor_y);
+  tensor_y->Resize({10, 10});
+
+  ctx.Wait();
 
   auto place = ctx.GetPlace();
   auto out = scope->Var("Out");
@@ -70,6 +72,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
   std::vector<float> out_vec;
   TensorToVector(*tensor_out, ctx, &out_vec);
 
+  ctx.Wait();
+
   EXPECT_EQ(out_vec.size(), init.size());
   for (uint32_t i = 0; i < out_vec.size(); i++) {
     EXPECT_EQ(out_vec[i], 2.0);
diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc
index 7eb0ff68e61..5a9f8008e7b 100644
--- a/paddle/fluid/operators/npu_op_runner.cc
+++ b/paddle/fluid/operators/npu_op_runner.cc
@@ -14,24 +14,30 @@ limitations under the License. */
 #include "paddle/fluid/operators/npu_op_runner.h"
 
-#include <string>
 #include <map>
+#include <string>
 #include <thread>
 #include <vector>
 
 #include "acl/acl.h"
+#include "acl/acl_op_compiler.h"
+
 #include "paddle/fluid/framework/framework.pb.h"
 
 namespace paddle {
 namespace operators {
 
-static std::map<framework::proto::VarType::Type, aclDataType> DTYPE_2_ACL_DTYPE = {
-    {framework::proto::VarType::BOOL, ACL_BOOL}, {framework::proto::VarType::INT16, ACL_INT16},
-    {framework::proto::VarType::INT32, ACL_INT32}, {framework::proto::VarType::INT64, ACL_INT64},
-    {framework::proto::VarType::FP16, ACL_FLOAT16}, {framework::proto::VarType::FP32, ACL_FLOAT},
-    {framework::proto::VarType::FP64, ACL_DOUBLE},
+static std::map<framework::proto::VarType::Type, aclDataType>
+    DTYPE_2_ACL_DTYPE = {
+        {framework::proto::VarType::BOOL, ACL_BOOL},
+        {framework::proto::VarType::INT16, ACL_INT16},
+        {framework::proto::VarType::INT32, ACL_INT32},
+        {framework::proto::VarType::INT64, ACL_INT64},
+        {framework::proto::VarType::FP16, ACL_FLOAT16},
+        {framework::proto::VarType::FP32, ACL_FLOAT},
+        {framework::proto::VarType::FP64, ACL_DOUBLE},
 };
 
 static std::map<DataLayout, aclFormat> DATA_LAYOUT_2_ACL_FORMAT = {
@@ -58,18 +64,22 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
   return iter->second;
 }
 
-NpuOpRunner::NpuOpRunner(std::string op_type) : op_type_(op_type) {}
+NpuOpRunner::NpuOpRunner(std::string op_type) : op_type_(op_type) {
+  attr_ = aclopCreateAttr();
+}
+
 NpuOpRunner::NpuOpRunner(std::string op_type, const std::vector<Tensor> &inputs,
                          const std::vector<Tensor> &outputs,
                          const AttributeMap &attrs)
     : op_type_(op_type) {
+  attr_ = aclopCreateAttr();
   AddInputs(inputs);
   AddOutputs(outputs);
   AddAttrs(attrs);
 }
 
 NpuOpRunner::~NpuOpRunner() {
-  //TODO(zhiqiu): handle free
+  // TODO(zhiqiu): handle free
 }
 
 const std::string &NpuOpRunner::Type() { return op_type_; }
@@ -84,23 +94,23 @@ NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name,
         aclopSetAttrInt(attr_, name.c_str(), BOOST_GET_CONST(int, attr)));
   } else if (attr.type() == typeid(int64_t)) {
-    PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrInt(
-        attr_, name.c_str(), BOOST_GET_CONST(int64_t, attr)));
+    PADDLE_ENFORCE_NPU_SUCCESS(
+        aclopSetAttrInt(attr_, name.c_str(), BOOST_GET_CONST(int64_t, attr)));
   } else if (attr.type() == typeid(float)) {
     PADDLE_ENFORCE_NPU_SUCCESS(
         aclopSetAttrFloat(attr_, name.c_str(), BOOST_GET_CONST(float, attr)));
   } else if (attr.type() == typeid(std::vector<bool>)) {
     auto a = BOOST_GET_CONST(std::vector<bool>, attr);
     std::vector<uint8_t> cast_a;
-    for(auto it : a) {
+    for (auto it : a) {
       cast_a.push_back(static_cast<uint8_t>(it));
     }
-    PADDLE_ENFORCE_NPU_SUCCESS(
-        aclopSetAttrListBool(attr_, name.c_str(), cast_a.size(), cast_a.data()));
+    PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrListBool(
+        attr_, name.c_str(), cast_a.size(), cast_a.data()));
   } else if (attr.type() == typeid(std::vector<int>)) {
     auto a = BOOST_GET_CONST(std::vector<int>, attr);
     std::vector<int64_t> cast_a;
-    for(auto it : a) {
+    for (auto it : a) {
       cast_a.push_back(static_cast<int64_t>(it));
     }
     PADDLE_ENFORCE_NPU_SUCCESS(
@@ -201,15 +211,22 @@ std::vector<aclTensorDesc *> &NpuOpRunner::GetOutputDescs() {
   return output_descs_;
 }
 
-std::vector<aclDataBuffer *> &NpuOpRunner::GetInputBuffers() { return input_buffers_; }
+std::vector<aclDataBuffer *> &NpuOpRunner::GetInputBuffers() {
+  return input_buffers_;
+}
 
-std::vector<aclDataBuffer *> &NpuOpRunner::GetOutputBuffers() { return output_buffers_; }
+std::vector<aclDataBuffer *> &NpuOpRunner::GetOutputBuffers() {
+  return output_buffers_;
+}
 
 aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) {
   auto dtype = ConvertToNpuDtype(tensor.type());
   auto format = ConvertToNpuFormat(tensor.layout());
   auto dims = framework::vectorize(tensor.dims());
 
+  VLOG(4) << dtype << " " << dims.size() << " " << dims[0] << "," << dims[1]
+          << " " << format;
+
   auto *desc = aclCreateTensorDesc(dtype, dims.size(), dims.data(), format);
   PADDLE_ENFORCE_NOT_NULL(
       desc, platform::errors::External("Call aclCreateTensorDesc failed."));
@@ -217,18 +234,26 @@ aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) {
 }
 
 aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) {
-  auto *buffer =
-      aclCreateDataBuffer(tensor.Holder()->ptr(), tensor.memory_size());
+  void *ptr = tensor.data<void>();
+  VLOG(4) << "ptr: " << ptr << ", size: " << tensor.memory_size();
+  auto *buffer = aclCreateDataBuffer(ptr, tensor.memory_size());
   PADDLE_ENFORCE_NOT_NULL(
       buffer, platform::errors::External("Call aclCreateDataBuffer failed."));
   return buffer;
 }
 
 void NpuOpRunner::Run(aclrtStream stream) {
-  aclError ret = aclopExecuteV2(op_type_.c_str(), input_descs_.size(),
-                                input_descs_.data(), input_buffers_.data(),
-                                output_descs_.size(), output_descs_.data(),
-                                output_buffers_.data(), attr_, stream);
+  VLOG(4) << "op_type: " << op_type_;
+  VLOG(4) << "input_desc.size: " << input_descs_.size();
+  VLOG(4) << "output_desc.size: " << output_descs_.size();
+  VLOG(4) << "stream: " << stream;
+  VLOG(4) << "attr: " << attr_;
+  aclError ret = aclopCompileAndExecute(
+      op_type_.c_str(), input_descs_.size(), input_descs_.data(),
+      input_buffers_.data(), output_descs_.size(), output_descs_.data(),
+      output_buffers_.data(), attr_, ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL,
+      stream);
+  VLOG(4) << "after aclopCompileAndExecute";
   PADDLE_ENFORCE_NPU_SUCCESS(ret);
 }
 }  // namespace operators
diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h
index 2e68226ed07..c69d8441e5d 100644
--- a/paddle/fluid/operators/npu_op_runner.h
+++ b/paddle/fluid/operators/npu_op_runner.h
@@ -32,7 +32,8 @@ using AttributeMap = framework::AttributeMap;
 class NpuOpRunner {
  public:
   explicit NpuOpRunner(std::string op_type);
-  explicit NpuOpRunner(std::string op_type, const std::vector<Tensor> &inputs = {},
+  explicit NpuOpRunner(std::string op_type,
+                       const std::vector<Tensor> &inputs = {},
                        const std::vector<Tensor> &outputs = {},
                        const AttributeMap &attrs = {});
 
@@ -40,7 +41,7 @@ class NpuOpRunner {
 
   const std::string &Type();
 
-  NpuOpRunner &AddAttr(const std::string& name, const Attribute &attr);
+  NpuOpRunner &AddAttr(const std::string &name, const Attribute &attr);
 
   NpuOpRunner &AddAttrs(const AttributeMap &attrs);
 
@@ -76,7 +77,7 @@ class NpuOpRunner {
   std::vector<aclDataBuffer *> output_buffers_;
   std::vector<aclTensorDesc *> input_descs_;
   std::vector<aclTensorDesc *> output_descs_;
-  aclopAttr *attr_;
+  aclopAttr *attr_{nullptr};
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 27389c4fd65..11c7ff546cc 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -72,7 +72,7 @@ if(WITH_ASCEND)
 endif()
 
 if(WITH_ASCEND_CL)
-  cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor atlas_acl)
+  cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor atlas_acl atlas_acl_op_compiler)
 endif()
 
 add_subdirectory(dynload)
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 79e606596f9..e5031acb9b4 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -83,6 +83,7 @@ bool AllowTF32Cudnn() { return allow_tf32_cudnn; }
 DeviceContextPool* DeviceContextPool::pool = nullptr;
 
 platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
+  VLOG(4) << "DeviceContextPool Get: " << place;
   auto it = device_contexts_.find(place);
   if (it == device_contexts_.end()) {
     PADDLE_THROW(platform::errors::Unimplemented(
@@ -243,6 +244,7 @@ NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
   // ACL creates a default context which contains 1 default stream
   // and 1 sync strean after aclrtSetDevice.
   PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetCurrentContext(&context_));
+  stream_.reset(new stream::NPUStream(place));
 }
 
 NPUDeviceContext::~NPUDeviceContext() {
@@ -255,6 +257,8 @@ void NPUDeviceContext::Wait() const {
   PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
 }
 
+aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
+
 Place NPUDeviceContext::GetPlace() const { return place_; }
 
 aclrtContext* NPUDeviceContext::context() const {
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 0b4ac60c836..f5fa6816b50 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -175,6 +175,9 @@ class NPUDeviceContext : public DeviceContext {
   /*! \brief  Wait for all operations completion in the stream. */
   void Wait() const override;
 
+  /*! \brief  Return npu stream in the device context. */
+  aclrtStream stream() const;
+
 #ifdef PADDLE_WITH_ASCEND_HCCL
   /*! \brief  Return bkcl context. */
   HCCLContext_t hccl_context() const { return hccl_context_; }
@@ -194,6 +197,8 @@ class NPUDeviceContext : public DeviceContext {
   // Eventhough eigen_device_ is not used in NPU
   // NOTE(zhiqiu): why need?
   std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
 
+  std::shared_ptr<stream::NPUStream> stream_;
+
   DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
 };
diff --git a/paddle/fluid/platform/npu_info.cc b/paddle/fluid/platform/npu_info.cc
index c7508f01a1a..4cb5d9325af 100644
--- a/paddle/fluid/platform/npu_info.cc
+++ b/paddle/fluid/platform/npu_info.cc
@@ -49,6 +49,12 @@ int GetNPUDeviceCount() {
   return dev_cnt;
 }
 
+int NPUCanAccessPeer(int src, int dst) {
+  int can = 0;
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtDeviceCanAccessPeer(&can, src, dst));
+  return can;
+}
+
 // For example, "1.0.1"
 std::string GetNPURuntimeVersion(int id) {
   PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(),
@@ -167,10 +173,12 @@ size_t NPUMaxChunkSize() {
   return max_chunk_size;
 }
 
-void NPUMemcpyASync(void *dst, const void *src, size_t count,
+void NPUMemcpyAsync(void *dst, const void *src, size_t count,
                     enum aclrtMemcpyKind kind, aclrtStream stream,
                     size_t dst_max_count) {
   dst_max_count = dst_max_count ? dst_max_count : count;
+  VLOG(4) << dst << " " << dst_max_count << " " << src << " " << count << " "
+          << kind << " " << stream;
   PADDLE_ENFORCE_NPU_SUCCESS(
       aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream));
 }
@@ -182,6 +190,21 @@ void NPUMemcpySync(void *dst, const void *src, size_t count,
   PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind));
 }
 
+void NPUMemcpyPeerASync(void *dst, int dst_device, const void *src,
+                        size_t count, enum aclrtMemcpyKind kind,
+                        aclrtStream stream, size_t dst_max_count) {
+  dst_max_count = dst_max_count ? dst_max_count : count;
+  PADDLE_ENFORCE_NPU_SUCCESS(
+      aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream));
+}
+
+void NPUMemcpyPeerSync(void *dst, int dst_device, const void *src, size_t count,
+                       enum aclrtMemcpyKind kind, size_t dst_max_count) {
+  // NOTE(zhiqiu): The default max_count is count
+  dst_max_count = dst_max_count ? dst_max_count : count;
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind));
+}
+
 void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream,
                     size_t max_count) {
   max_count = max_count ? max_count : count;
diff --git a/paddle/fluid/platform/npu_info.h b/paddle/fluid/platform/npu_info.h
index beac23dc960..7caada68190 100644
--- a/paddle/fluid/platform/npu_info.h
+++ b/paddle/fluid/platform/npu_info.h
@@ -31,10 +31,15 @@ int GetNPUDeviceCount();
 //! Get the runtime version of the ith NPU
 std::string GetNPURuntimeVersion(int id);
 
+//! Check if this device can access peer or not.
+int NPUCanAccessPeer(int src, int dst);
 //! Get the current NPU device id in system.
 int GetCurrentNPUDeviceId();
 
+//! Get the current NPU stream.
+int GetCurrentStream();
+
 //! Get a list of device ids from environment variable or use all.
 std::vector<int> GetSelectedNPUDevices();
@@ -79,6 +84,15 @@ void NPUMemcpySync(void *dst, const void *src, size_t count,
 void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream,
                     size_t max_count = 0);
 
+//! Copy memory from one device to another device asynchronously.
+void NPUMemcpyPeerAsync(void *dst, int dst_device, const void *src,
+                        int src_device, size_t count, aclrtStream stream,
+                        size_t max_count = 0);
+
+//! Copy memory from one device to another device synchronously.
+void NPUMemcpyPeerSync(void *dst, int dst_device, const void *src,
+                       int src_device, size_t count, size_t max_count = 0);
+
 //! Blocks until stream has completed all operations.
 void NPUStreamSync(aclrtStream stream);
-- 
GitLab
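
Usage note (illustrative only, not part of the patch above): the pieces in this diff come
together in an op kernel that builds an NpuOpRunner and launches it on the stream owned by
the new NPUDeviceContext. The sketch below shows that pattern under a few assumptions: the
Ascend operator name "Add", the empty attribute map, and the omitted kernel registration
are placeholders, and the elementwise_add_op_npu.cc kernel mentioned in the commit message
is not reproduced here.

    // Sketch of an NPU op kernel driving NpuOpRunner (assumptions noted above).
    #include "paddle/fluid/framework/op_registry.h"
    #include "paddle/fluid/operators/npu_op_runner.h"

    namespace paddle {
    namespace operators {

    template <typename T>
    class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
     public:
      void Compute(const framework::ExecutionContext& ctx) const override {
        auto* x = ctx.Input<Tensor>("X");
        auto* y = ctx.Input<Tensor>("Y");
        auto* out = ctx.Output<Tensor>("Out");
        out->mutable_data<T>(ctx.GetPlace());

        // Describe the Ascend "Add" operator: two inputs, one output, no attributes.
        auto runner = NpuOpRunner("Add", {*x, *y}, {*out}, {});

        // Run() now calls aclopCompileAndExecute on the stream exposed by the
        // NPUDeviceContext::stream() accessor added in this patch.
        auto stream =
            ctx.template device_context<platform::NPUDeviceContext>().stream();
        runner.Run(stream);
      }
    };

    }  // namespace operators
    }  // namespace paddle
    // (kernel registration omitted)

The ctx.Wait() calls added to elementwise_add_op_npu_test.cc exist for the same reason:
the op now runs asynchronously on an NPU stream, so the host must synchronize before
reading the output back with TensorToVector.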