未验证 提交 1201cd2e 编写于 作者: L Leo Chen 提交者: GitHub

[feature] support npu allocator, part 2 (#30972)

* support npu allocator

* add npu device context

* fix some compile problem

* fix some compile problem

* add npu info

* compile ok

* fix include dir

* support naive_best_fit_allocator

* run ut ok, but failed to exit

* call aclrtResetDevice before exit

* fix aclFinalize

* add system allocator test

* add selected_gpus in gtest

* add tensor_test for npu

* support npu op, initial commit

* add npu stream

* add elementwise_add_op

* compile ok

* fix typo

* fix elementwise_add_op_npu_test

* support op run

* test can run but failed

* change aclopExecuteV2 to aclopCompileAndExecute
上级 7e049108
...@@ -63,16 +63,22 @@ elseif(WITH_ASCEND_CL) ...@@ -63,16 +63,22 @@ elseif(WITH_ASCEND_CL)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64) set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR}) set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64) set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64) set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(atlas_acl_lib ${ATLAS_ACL_DIR}/libascendcl.so) set(atlas_acl_lib ${ATLAS_ACL_DIR}/libascendcl.so)
set(ATLAS_ACL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include) set(atlas_acl_op_compiler_lib ${ATLAS_ACL_DIR}/libacl_op_compiler.so)
set(ATLAS_ACL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
message(STATUS "ATLAS_ACL_INC_DIR ${ATLAS_ACL_INC_DIR}") message(STATUS "ATLAS_ACL_INC_DIR ${ATLAS_ACL_INC_DIR}")
message(STATUS "ATLAS_ACL_LIB_DIR ${ATLAS_ACL_DIR}") message(STATUS "ATLAS_ACL_LIB_DIR ${ATLAS_ACL_DIR}")
INCLUDE_DIRECTORIES(${ATLAS_ACL_INC_DIR}) INCLUDE_DIRECTORIES(${ATLAS_ACL_INC_DIR})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL) ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib}) SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
add_custom_target(extern_ascend DEPENDS atlas_acl)
ADD_LIBRARY(atlas_acl_op_compiler SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl_op_compiler PROPERTY IMPORTED_LOCATION ${atlas_acl_op_compiler_lib})
add_custom_target(extern_ascend DEPENDS atlas_acl atlas_acl_op_compiler)
endif() endif()
...@@ -59,12 +59,14 @@ class AscendInstance { ...@@ -59,12 +59,14 @@ class AscendInstance {
std::map<AscendString, AscendString> _GetDefaultInitSessionOptions() { std::map<AscendString, AscendString> _GetDefaultInitSessionOptions() {
std::map<AscendString, AscendString> init_options; std::map<AscendString, AscendString> init_options;
//init_options["a"] = "b"; // init_options["a"] = "b";
//init_options["ge.trainFlag"] = "1"; // init_options["ge.trainFlag"] = "1";
return init_options; return init_options;
} }
ge::Status InitGEForUT() { return ge::GEInitialize(_GetDefaultInitOptions()); } ge::Status InitGEForUT() {
return ge::GEInitialize(_GetDefaultInitOptions());
}
void InitGlobalResouces() { void InitGlobalResouces() {
LOG(INFO) << "Begin ascend InitGlobalResouces"; LOG(INFO) << "Begin ascend InitGlobalResouces";
......
...@@ -39,11 +39,9 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { ...@@ -39,11 +39,9 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
// init // init
auto x = scope->Var("X"); auto x = scope->Var("X");
auto tensor_x = x->GetMutable<f::LoDTensor>(); auto tensor_x = x->GetMutable<f::LoDTensor>();
tensor_x->Resize({10, 10});
auto y = scope->Var("Y"); auto y = scope->Var("Y");
auto tensor_y = y->GetMutable<f::LoDTensor>(); auto tensor_y = y->GetMutable<f::LoDTensor>();
tensor_y->Resize({10, 10});
std::vector<float> init; std::vector<float> init;
for (int64_t i = 0; i < 10 * 10; ++i) { for (int64_t i = 0; i < 10 * 10; ++i) {
...@@ -51,7 +49,11 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { ...@@ -51,7 +49,11 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
} }
TensorFromVector(init, ctx, tensor_x); TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({10, 10});
TensorFromVector(init, ctx, tensor_y); TensorFromVector(init, ctx, tensor_y);
tensor_y->Resize({10, 10});
ctx.Wait();
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
auto out = scope->Var("Out"); auto out = scope->Var("Out");
...@@ -70,6 +72,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { ...@@ -70,6 +72,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
std::vector<float> out_vec; std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec); TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
EXPECT_EQ(out_vec.size(), init.size()); EXPECT_EQ(out_vec.size(), init.size());
for (uint32_t i = 0; i < out_vec.size(); i++) { for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], 2.0); EXPECT_EQ(out_vec[i], 2.0);
......
...@@ -14,23 +14,29 @@ limitations under the License. */ ...@@ -14,23 +14,29 @@ limitations under the License. */
#include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/npu_op_runner.h"
#include <paddle/fluid/framework/operator.h>
#include <paddle/fluid/framework/data_type.h> #include <paddle/fluid/framework/data_type.h>
#include <paddle/fluid/framework/operator.h>
#include <map> #include <map>
#include <string> #include <string>
#include <vector> #include <vector>
#include "acl/acl.h" #include "acl/acl.h"
#include "acl/acl_op_compiler.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
static std::map<framework::proto::VarType::Type, aclDataType> DTYPE_2_ACL_DTYPE = { static std::map<framework::proto::VarType::Type, aclDataType>
{framework::proto::VarType::BOOL, ACL_BOOL}, {framework::proto::VarType::INT16, ACL_INT16}, DTYPE_2_ACL_DTYPE = {
{framework::proto::VarType::INT32, ACL_INT32}, {framework::proto::VarType::INT64, ACL_INT64}, {framework::proto::VarType::BOOL, ACL_BOOL},
{framework::proto::VarType::FP16, ACL_FLOAT16}, {framework::proto::VarType::FP32, ACL_FLOAT}, {framework::proto::VarType::INT16, ACL_INT16},
{framework::proto::VarType::INT32, ACL_INT32},
{framework::proto::VarType::INT64, ACL_INT64},
{framework::proto::VarType::FP16, ACL_FLOAT16},
{framework::proto::VarType::FP32, ACL_FLOAT},
{framework::proto::VarType::FP64, ACL_DOUBLE}, {framework::proto::VarType::FP64, ACL_DOUBLE},
}; };
...@@ -58,18 +64,22 @@ aclFormat ConvertToNpuFormat(DataLayout layout) { ...@@ -58,18 +64,22 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
return iter->second; return iter->second;
} }
NpuOpRunner::NpuOpRunner(std::string op_type) : op_type_(op_type) {} NpuOpRunner::NpuOpRunner(std::string op_type) : op_type_(op_type) {
attr_ = aclopCreateAttr();
}
NpuOpRunner::NpuOpRunner(std::string op_type, const std::vector<Tensor> &inputs, NpuOpRunner::NpuOpRunner(std::string op_type, const std::vector<Tensor> &inputs,
const std::vector<Tensor> &outputs, const std::vector<Tensor> &outputs,
const AttributeMap &attrs) const AttributeMap &attrs)
: op_type_(op_type) { : op_type_(op_type) {
attr_ = aclopCreateAttr();
AddInputs(inputs); AddInputs(inputs);
AddOutputs(outputs); AddOutputs(outputs);
AddAttrs(attrs); AddAttrs(attrs);
} }
NpuOpRunner::~NpuOpRunner() { NpuOpRunner::~NpuOpRunner() {
//TODO(zhiqiu): handle free // TODO(zhiqiu): handle free
} }
const std::string &NpuOpRunner::Type() { return op_type_; } const std::string &NpuOpRunner::Type() { return op_type_; }
...@@ -84,23 +94,23 @@ NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name, ...@@ -84,23 +94,23 @@ NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name,
aclopSetAttrInt(attr_, name.c_str(), BOOST_GET_CONST(int, attr))); aclopSetAttrInt(attr_, name.c_str(), BOOST_GET_CONST(int, attr)));
} else if (attr.type() == typeid(int64_t)) { } else if (attr.type() == typeid(int64_t)) {
PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrInt( PADDLE_ENFORCE_NPU_SUCCESS(
attr_, name.c_str(), BOOST_GET_CONST(int64_t, attr))); aclopSetAttrInt(attr_, name.c_str(), BOOST_GET_CONST(int64_t, attr)));
} else if (attr.type() == typeid(float)) { } else if (attr.type() == typeid(float)) {
PADDLE_ENFORCE_NPU_SUCCESS( PADDLE_ENFORCE_NPU_SUCCESS(
aclopSetAttrFloat(attr_, name.c_str(), BOOST_GET_CONST(float, attr))); aclopSetAttrFloat(attr_, name.c_str(), BOOST_GET_CONST(float, attr)));
} else if (attr.type() == typeid(std::vector<bool>)) { } else if (attr.type() == typeid(std::vector<bool>)) {
auto a = BOOST_GET_CONST(std::vector<bool>, attr); auto a = BOOST_GET_CONST(std::vector<bool>, attr);
std::vector<uint8_t> cast_a; std::vector<uint8_t> cast_a;
for(auto it : a) { for (auto it : a) {
cast_a.push_back(static_cast<uint8_t>(it)); cast_a.push_back(static_cast<uint8_t>(it));
} }
PADDLE_ENFORCE_NPU_SUCCESS( PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrListBool(
aclopSetAttrListBool(attr_, name.c_str(), cast_a.size(), cast_a.data())); attr_, name.c_str(), cast_a.size(), cast_a.data()));
} else if (attr.type() == typeid(std::vector<int>)) { } else if (attr.type() == typeid(std::vector<int>)) {
auto a = BOOST_GET_CONST(std::vector<int>, attr); auto a = BOOST_GET_CONST(std::vector<int>, attr);
std::vector<int64_t> cast_a; std::vector<int64_t> cast_a;
for(auto it : a) { for (auto it : a) {
cast_a.push_back(static_cast<int64_t>(it)); cast_a.push_back(static_cast<int64_t>(it));
} }
PADDLE_ENFORCE_NPU_SUCCESS( PADDLE_ENFORCE_NPU_SUCCESS(
...@@ -201,15 +211,22 @@ std::vector<aclTensorDesc *> &NpuOpRunner::GetOutputDescs() { ...@@ -201,15 +211,22 @@ std::vector<aclTensorDesc *> &NpuOpRunner::GetOutputDescs() {
return output_descs_; return output_descs_;
} }
std::vector<aclDataBuffer *> &NpuOpRunner::GetInputBuffers() { return input_buffers_; } std::vector<aclDataBuffer *> &NpuOpRunner::GetInputBuffers() {
return input_buffers_;
}
std::vector<aclDataBuffer *> &NpuOpRunner::GetOutputBuffers() { return output_buffers_; } std::vector<aclDataBuffer *> &NpuOpRunner::GetOutputBuffers() {
return output_buffers_;
}
aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) { aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) {
auto dtype = ConvertToNpuDtype(tensor.type()); auto dtype = ConvertToNpuDtype(tensor.type());
auto format = ConvertToNpuFormat(tensor.layout()); auto format = ConvertToNpuFormat(tensor.layout());
auto dims = framework::vectorize(tensor.dims()); auto dims = framework::vectorize(tensor.dims());
VLOG(4) << dtype << " " << dims.size() << " " << dims[0] << "," << dims[1]
<< " " << format;
auto *desc = aclCreateTensorDesc(dtype, dims.size(), dims.data(), format); auto *desc = aclCreateTensorDesc(dtype, dims.size(), dims.data(), format);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
desc, platform::errors::External("Call aclCreateTensorDesc failed.")); desc, platform::errors::External("Call aclCreateTensorDesc failed."));
...@@ -217,18 +234,26 @@ aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) { ...@@ -217,18 +234,26 @@ aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) {
} }
aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) { aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) {
auto *buffer = void *ptr = tensor.data<void>();
aclCreateDataBuffer(tensor.Holder()->ptr(), tensor.memory_size()); VLOG(4) << "ptr: " << ptr << ", size: " << tensor.memory_size();
auto *buffer = aclCreateDataBuffer(ptr, tensor.memory_size());
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
buffer, platform::errors::External("Call aclCreateDataBuffer failed.")); buffer, platform::errors::External("Call aclCreateDataBuffer failed."));
return buffer; return buffer;
} }
void NpuOpRunner::Run(aclrtStream stream) { void NpuOpRunner::Run(aclrtStream stream) {
aclError ret = aclopExecuteV2(op_type_.c_str(), input_descs_.size(), VLOG(4) << "op_type: " << op_type_;
input_descs_.data(), input_buffers_.data(), VLOG(4) << "input_desc.size: " << input_descs_.size();
output_descs_.size(), output_descs_.data(), VLOG(4) << "output_desc.size: " << output_descs_.size();
output_buffers_.data(), attr_, stream); VLOG(4) << "stream: " << stream;
VLOG(4) << "attr: " << attr_;
aclError ret = aclopCompileAndExecute(
op_type_.c_str(), input_descs_.size(), input_descs_.data(),
input_buffers_.data(), output_descs_.size(), output_descs_.data(),
output_buffers_.data(), attr_, ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL,
stream);
VLOG(4) << "after aclopCompileAndExecute";
PADDLE_ENFORCE_NPU_SUCCESS(ret); PADDLE_ENFORCE_NPU_SUCCESS(ret);
} }
} // namespace operators } // namespace operators
......
...@@ -32,7 +32,8 @@ using AttributeMap = framework::AttributeMap; ...@@ -32,7 +32,8 @@ using AttributeMap = framework::AttributeMap;
class NpuOpRunner { class NpuOpRunner {
public: public:
explicit NpuOpRunner(std::string op_type); explicit NpuOpRunner(std::string op_type);
explicit NpuOpRunner(std::string op_type, const std::vector<Tensor> &inputs = {}, explicit NpuOpRunner(std::string op_type,
const std::vector<Tensor> &inputs = {},
const std::vector<Tensor> &outputs = {}, const std::vector<Tensor> &outputs = {},
const AttributeMap &attrs = {}); const AttributeMap &attrs = {});
...@@ -40,7 +41,7 @@ class NpuOpRunner { ...@@ -40,7 +41,7 @@ class NpuOpRunner {
const std::string &Type(); const std::string &Type();
NpuOpRunner &AddAttr(const std::string& name, const Attribute &attr); NpuOpRunner &AddAttr(const std::string &name, const Attribute &attr);
NpuOpRunner &AddAttrs(const AttributeMap &attrs); NpuOpRunner &AddAttrs(const AttributeMap &attrs);
...@@ -76,7 +77,7 @@ class NpuOpRunner { ...@@ -76,7 +77,7 @@ class NpuOpRunner {
std::vector<aclDataBuffer *> output_buffers_; std::vector<aclDataBuffer *> output_buffers_;
std::vector<aclTensorDesc *> input_descs_; std::vector<aclTensorDesc *> input_descs_;
std::vector<aclTensorDesc *> output_descs_; std::vector<aclTensorDesc *> output_descs_;
aclopAttr *attr_; aclopAttr *attr_{nullptr};
}; };
} // namespace operators } // namespace operators
......
...@@ -72,7 +72,7 @@ if(WITH_ASCEND) ...@@ -72,7 +72,7 @@ if(WITH_ASCEND)
endif() endif()
if(WITH_ASCEND_CL) if(WITH_ASCEND_CL)
cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor atlas_acl) cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor atlas_acl atlas_acl_op_compiler)
endif() endif()
add_subdirectory(dynload) add_subdirectory(dynload)
......
...@@ -83,6 +83,7 @@ bool AllowTF32Cudnn() { return allow_tf32_cudnn; } ...@@ -83,6 +83,7 @@ bool AllowTF32Cudnn() { return allow_tf32_cudnn; }
DeviceContextPool* DeviceContextPool::pool = nullptr; DeviceContextPool* DeviceContextPool::pool = nullptr;
platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
VLOG(4) << "DeviceContextPool Get: " << place;
auto it = device_contexts_.find(place); auto it = device_contexts_.find(place);
if (it == device_contexts_.end()) { if (it == device_contexts_.end()) {
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
...@@ -243,6 +244,7 @@ NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) { ...@@ -243,6 +244,7 @@ NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
// ACL creates a default context which contains 1 default stream // ACL creates a default context which contains 1 default stream
// and 1 sync stream after aclrtSetDevice. // and 1 sync stream after aclrtSetDevice.
PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetCurrentContext(&context_)); PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetCurrentContext(&context_));
stream_.reset(new stream::NPUStream(place));
} }
NPUDeviceContext::~NPUDeviceContext() { NPUDeviceContext::~NPUDeviceContext() {
...@@ -255,6 +257,8 @@ void NPUDeviceContext::Wait() const { ...@@ -255,6 +257,8 @@ void NPUDeviceContext::Wait() const {
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice()); PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
} }
// Returns the raw ACL stream handle exposed by the NPUStream object that this
// device context holds in `stream_`.
aclrtStream NPUDeviceContext::stream() const {
  return stream_->raw_stream();
}
Place NPUDeviceContext::GetPlace() const { return place_; } Place NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext* NPUDeviceContext::context() const { aclrtContext* NPUDeviceContext::context() const {
......
...@@ -175,6 +175,9 @@ class NPUDeviceContext : public DeviceContext { ...@@ -175,6 +175,9 @@ class NPUDeviceContext : public DeviceContext {
/*! \brief Wait for all operations completion in the stream. */ /*! \brief Wait for all operations completion in the stream. */
void Wait() const override; void Wait() const override;
/*! \brief Return npu stream in the device context. */
aclrtStream stream() const;
#ifdef PADDLE_WITH_ASCEND_HCCL #ifdef PADDLE_WITH_ASCEND_HCCL
/*! \brief Return hccl context. */ /*! \brief Return hccl context. */
HCCLContext_t hccl_context() const { return hccl_context_; } HCCLContext_t hccl_context() const { return hccl_context_; }
...@@ -194,6 +197,8 @@ class NPUDeviceContext : public DeviceContext { ...@@ -194,6 +197,8 @@ class NPUDeviceContext : public DeviceContext {
// Even though eigen_device_ is not used in NPU // Even though eigen_device_ is not used in NPU
// NOTE(zhiqiu): why need? // NOTE(zhiqiu): why need?
std::unique_ptr<Eigen::DefaultDevice> eigen_device_; std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
std::shared_ptr<stream::NPUStream> stream_;
DISABLE_COPY_AND_ASSIGN(NPUDeviceContext); DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
}; };
......
...@@ -49,6 +49,12 @@ int GetNPUDeviceCount() { ...@@ -49,6 +49,12 @@ int GetNPUDeviceCount() {
return dev_cnt; return dev_cnt;
} }
// Asks ACL (aclrtDeviceCanAccessPeer) about peer access between device ids
// `src` and `dst` and returns the flag ACL writes back. The flag starts at 0,
// and a failing ACL status is raised through PADDLE_ENFORCE_NPU_SUCCESS.
int NPUCanAccessPeer(int src, int dst) {
  int accessible = 0;
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtDeviceCanAccessPeer(&accessible, src, dst));
  return accessible;
}
// For example, "1.0.1" // For example, "1.0.1"
std::string GetNPURuntimeVersion(int id) { std::string GetNPURuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(), PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(),
...@@ -167,10 +173,12 @@ size_t NPUMaxChunkSize() { ...@@ -167,10 +173,12 @@ size_t NPUMaxChunkSize() {
return max_chunk_size; return max_chunk_size;
} }
void NPUMemcpyASync(void *dst, const void *src, size_t count, void NPUMemcpyAsync(void *dst, const void *src, size_t count,
enum aclrtMemcpyKind kind, aclrtStream stream, enum aclrtMemcpyKind kind, aclrtStream stream,
size_t dst_max_count) { size_t dst_max_count) {
dst_max_count = dst_max_count ? dst_max_count : count; dst_max_count = dst_max_count ? dst_max_count : count;
VLOG(4) << dst << " " << dst_max_count << " " << src << " " << count << " "
<< kind << " " << stream;
PADDLE_ENFORCE_NPU_SUCCESS( PADDLE_ENFORCE_NPU_SUCCESS(
aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream)); aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream));
} }
...@@ -182,6 +190,21 @@ void NPUMemcpySync(void *dst, const void *src, size_t count, ...@@ -182,6 +190,21 @@ void NPUMemcpySync(void *dst, const void *src, size_t count,
PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind)); PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind));
} }
void NPUMemcpyPeerASync(void *dst, int dst_device, const void *src,
size_t count, enum aclrtMemcpyKind kind,
aclrtStream stream, size_t dst_max_count) {
dst_max_count = dst_max_count ? dst_max_count : count;
PADDLE_ENFORCE_NPU_SUCCESS(
aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream));
}
void NPUMemcpyPeerSync(void *dst, int dst_device, const void *src, size_t count,
enum aclrtMemcpyKind kind, size_t dst_max_count) {
// NOTE(zhiqiu): The default max_count is count
dst_max_count = dst_max_count ? dst_max_count : count;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind));
}
void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream, void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream,
size_t max_count) { size_t max_count) {
max_count = max_count ? max_count : count; max_count = max_count ? max_count : count;
......
...@@ -31,10 +31,15 @@ int GetNPUDeviceCount(); ...@@ -31,10 +31,15 @@ int GetNPUDeviceCount();
//! Get the runtime version of the ith NPU //! Get the runtime version of the ith NPU
std::string GetNPURuntimeVersion(int id); std::string GetNPURuntimeVersion(int id);
//! Check if this device can access peer or not.
int NPUCanAccessPeer(int src, int dst);
//! Get the current NPU device id in system. //! Get the current NPU device id in system.
int GetCurrentNPUDeviceId(); int GetCurrentNPUDeviceId();
//! Get the current NPU stream.
int GetCurrentStream();
//! Get a list of device ids from environment variable or use all. //! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedNPUDevices(); std::vector<int> GetSelectedNPUDevices();
...@@ -79,6 +84,15 @@ void NPUMemcpySync(void *dst, const void *src, size_t count, ...@@ -79,6 +84,15 @@ void NPUMemcpySync(void *dst, const void *src, size_t count,
void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream, void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream,
size_t max_count = 0); size_t max_count = 0);
//! Copy memory from one device to another device asynchronously.
void NPUMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, aclrtStream stream,
size_t max_count = 0);
//! Copy memory from one device to another device synchronously.
void NPUMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count, size_t max_count = 0);
//! Blocks until stream has completed all operations. //! Blocks until stream has completed all operations.
void NPUStreamSync(aclrtStream stream); void NPUStreamSync(aclrtStream stream);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册