未验证 提交 1201cd2e 编写于 作者: L Leo Chen 提交者: GitHub

[feature] support npu allocator, part 2 (#30972)

* support npu allocator

* add npu device context

* fix some compile problem

* fix some compile problem

* add npu info

* compile ok

* fix include dir

* support naive_best_fit_allocator

* run ut ok, but failed to exit

* call aclrtResetDevice before exit

* fix aclFinilize

* add system allocator test

* add selected_gpus in gtest

* add tensor_test for npu

* support npu op, initial commit

* add npu stream

* add elementwise_add_op

* compile ok

* fix typo

* fix elementwise_add_op_npu_test

* support op run

* test can run but failed

* change aclopExecuteV2 to aclopCompileAndExecute
上级 7e049108
......@@ -63,16 +63,22 @@ elseif(WITH_ASCEND_CL)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(atlas_acl_lib ${ATLAS_ACL_DIR}/libascendcl.so)
set(ATLAS_ACL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
set(atlas_acl_op_compiler_lib ${ATLAS_ACL_DIR}/libacl_op_compiler.so)
set(ATLAS_ACL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
message(STATUS "ATLAS_ACL_INC_DIR ${ATLAS_ACL_INC_DIR}")
message(STATUS "ATLAS_ACL_LIB_DIR ${ATLAS_ACL_DIR}")
INCLUDE_DIRECTORIES(${ATLAS_ACL_INC_DIR})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
add_custom_target(extern_ascend DEPENDS atlas_acl)
ADD_LIBRARY(atlas_acl_op_compiler SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl_op_compiler PROPERTY IMPORTED_LOCATION ${atlas_acl_op_compiler_lib})
add_custom_target(extern_ascend DEPENDS atlas_acl atlas_acl_op_compiler)
endif()
......@@ -59,12 +59,14 @@ class AscendInstance {
std::map<AscendString, AscendString> _GetDefaultInitSessionOptions() {
std::map<AscendString, AscendString> init_options;
//init_options["a"] = "b";
//init_options["ge.trainFlag"] = "1";
// init_options["a"] = "b";
// init_options["ge.trainFlag"] = "1";
return init_options;
}
ge::Status InitGEForUT() { return ge::GEInitialize(_GetDefaultInitOptions()); }
ge::Status InitGEForUT() {
return ge::GEInitialize(_GetDefaultInitOptions());
}
void InitGlobalResouces() {
LOG(INFO) << "Begin ascend InitGlobalResouces";
......
......@@ -39,11 +39,9 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<f::LoDTensor>();
tensor_x->Resize({10, 10});
auto y = scope->Var("Y");
auto tensor_y = y->GetMutable<f::LoDTensor>();
tensor_y->Resize({10, 10});
std::vector<float> init;
for (int64_t i = 0; i < 10 * 10; ++i) {
......@@ -51,7 +49,11 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
}
TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({10, 10});
TensorFromVector(init, ctx, tensor_y);
tensor_y->Resize({10, 10});
ctx.Wait();
auto place = ctx.GetPlace();
auto out = scope->Var("Out");
......@@ -70,6 +72,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
EXPECT_EQ(out_vec.size(), init.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], 2.0);
......
......@@ -14,24 +14,30 @@ limitations under the License. */
#include "paddle/fluid/operators/npu_op_runner.h"
#include <paddle/fluid/framework/operator.h>
#include <paddle/fluid/framework/data_type.h>
#include <paddle/fluid/framework/operator.h>
#include <map>
#include <string>
#include <vector>
#include "acl/acl.h"
#include "acl/acl_op_compiler.h"
#include "paddle/fluid/framework/framework.pb.h"
namespace paddle {
namespace operators {
static std::map<framework::proto::VarType::Type, aclDataType> DTYPE_2_ACL_DTYPE = {
{framework::proto::VarType::BOOL, ACL_BOOL}, {framework::proto::VarType::INT16, ACL_INT16},
{framework::proto::VarType::INT32, ACL_INT32}, {framework::proto::VarType::INT64, ACL_INT64},
{framework::proto::VarType::FP16, ACL_FLOAT16}, {framework::proto::VarType::FP32, ACL_FLOAT},
{framework::proto::VarType::FP64, ACL_DOUBLE},
static std::map<framework::proto::VarType::Type, aclDataType>
DTYPE_2_ACL_DTYPE = {
{framework::proto::VarType::BOOL, ACL_BOOL},
{framework::proto::VarType::INT16, ACL_INT16},
{framework::proto::VarType::INT32, ACL_INT32},
{framework::proto::VarType::INT64, ACL_INT64},
{framework::proto::VarType::FP16, ACL_FLOAT16},
{framework::proto::VarType::FP32, ACL_FLOAT},
{framework::proto::VarType::FP64, ACL_DOUBLE},
};
static std::map<DataLayout, aclFormat> DATA_LAYOUT_2_ACL_FORMAT = {
......@@ -58,18 +64,22 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
return iter->second;
}
NpuOpRunner::NpuOpRunner(std::string op_type) : op_type_(op_type) {}
NpuOpRunner::NpuOpRunner(std::string op_type) : op_type_(op_type) {
attr_ = aclopCreateAttr();
}
NpuOpRunner::NpuOpRunner(std::string op_type, const std::vector<Tensor> &inputs,
const std::vector<Tensor> &outputs,
const AttributeMap &attrs)
: op_type_(op_type) {
attr_ = aclopCreateAttr();
AddInputs(inputs);
AddOutputs(outputs);
AddAttrs(attrs);
}
NpuOpRunner::~NpuOpRunner() {
//TODO(zhiqiu): handle free
// TODO(zhiqiu): handle free
}
const std::string &NpuOpRunner::Type() { return op_type_; }
......@@ -84,23 +94,23 @@ NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name,
aclopSetAttrInt(attr_, name.c_str(), BOOST_GET_CONST(int, attr)));
} else if (attr.type() == typeid(int64_t)) {
PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrInt(
attr_, name.c_str(), BOOST_GET_CONST(int64_t, attr)));
PADDLE_ENFORCE_NPU_SUCCESS(
aclopSetAttrInt(attr_, name.c_str(), BOOST_GET_CONST(int64_t, attr)));
} else if (attr.type() == typeid(float)) {
PADDLE_ENFORCE_NPU_SUCCESS(
aclopSetAttrFloat(attr_, name.c_str(), BOOST_GET_CONST(float, attr)));
} else if (attr.type() == typeid(std::vector<bool>)) {
auto a = BOOST_GET_CONST(std::vector<bool>, attr);
std::vector<uint8_t> cast_a;
for(auto it : a) {
for (auto it : a) {
cast_a.push_back(static_cast<uint8_t>(it));
}
PADDLE_ENFORCE_NPU_SUCCESS(
aclopSetAttrListBool(attr_, name.c_str(), cast_a.size(), cast_a.data()));
PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrListBool(
attr_, name.c_str(), cast_a.size(), cast_a.data()));
} else if (attr.type() == typeid(std::vector<int>)) {
auto a = BOOST_GET_CONST(std::vector<int>, attr);
std::vector<int64_t> cast_a;
for(auto it : a) {
for (auto it : a) {
cast_a.push_back(static_cast<int64_t>(it));
}
PADDLE_ENFORCE_NPU_SUCCESS(
......@@ -201,15 +211,22 @@ std::vector<aclTensorDesc *> &NpuOpRunner::GetOutputDescs() {
return output_descs_;
}
std::vector<aclDataBuffer *> &NpuOpRunner::GetInputBuffers() { return input_buffers_; }
std::vector<aclDataBuffer *> &NpuOpRunner::GetInputBuffers() {
return input_buffers_;
}
std::vector<aclDataBuffer *> &NpuOpRunner::GetOutputBuffers() { return output_buffers_; }
std::vector<aclDataBuffer *> &NpuOpRunner::GetOutputBuffers() {
return output_buffers_;
}
aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) {
auto dtype = ConvertToNpuDtype(tensor.type());
auto format = ConvertToNpuFormat(tensor.layout());
auto dims = framework::vectorize(tensor.dims());
VLOG(4) << dtype << " " << dims.size() << " " << dims[0] << "," << dims[1]
<< " " << format;
auto *desc = aclCreateTensorDesc(dtype, dims.size(), dims.data(), format);
PADDLE_ENFORCE_NOT_NULL(
desc, platform::errors::External("Call aclCreateTensorDesc failed."));
......@@ -217,18 +234,26 @@ aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) {
}
aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) {
auto *buffer =
aclCreateDataBuffer(tensor.Holder()->ptr(), tensor.memory_size());
void *ptr = tensor.data<void>();
VLOG(4) << "ptr: " << ptr << ", size: " << tensor.memory_size();
auto *buffer = aclCreateDataBuffer(ptr, tensor.memory_size());
PADDLE_ENFORCE_NOT_NULL(
buffer, platform::errors::External("Call aclCreateDataBuffer failed."));
return buffer;
}
void NpuOpRunner::Run(aclrtStream stream) {
aclError ret = aclopExecuteV2(op_type_.c_str(), input_descs_.size(),
input_descs_.data(), input_buffers_.data(),
output_descs_.size(), output_descs_.data(),
output_buffers_.data(), attr_, stream);
VLOG(4) << "op_type: " << op_type_;
VLOG(4) << "input_desc.size: " << input_descs_.size();
VLOG(4) << "output_desc.size: " << output_descs_.size();
VLOG(4) << "stream: " << stream;
VLOG(4) << "attr: " << attr_;
aclError ret = aclopCompileAndExecute(
op_type_.c_str(), input_descs_.size(), input_descs_.data(),
input_buffers_.data(), output_descs_.size(), output_descs_.data(),
output_buffers_.data(), attr_, ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL,
stream);
VLOG(4) << "after aclopCompileAndExecute";
PADDLE_ENFORCE_NPU_SUCCESS(ret);
}
} // namespace operators
......
......@@ -32,7 +32,8 @@ using AttributeMap = framework::AttributeMap;
class NpuOpRunner {
public:
explicit NpuOpRunner(std::string op_type);
explicit NpuOpRunner(std::string op_type, const std::vector<Tensor> &inputs = {},
explicit NpuOpRunner(std::string op_type,
const std::vector<Tensor> &inputs = {},
const std::vector<Tensor> &outputs = {},
const AttributeMap &attrs = {});
......@@ -40,7 +41,7 @@ class NpuOpRunner {
const std::string &Type();
NpuOpRunner &AddAttr(const std::string& name, const Attribute &attr);
NpuOpRunner &AddAttr(const std::string &name, const Attribute &attr);
NpuOpRunner &AddAttrs(const AttributeMap &attrs);
......@@ -76,7 +77,7 @@ class NpuOpRunner {
std::vector<aclDataBuffer *> output_buffers_;
std::vector<aclTensorDesc *> input_descs_;
std::vector<aclTensorDesc *> output_descs_;
aclopAttr *attr_;
aclopAttr *attr_{nullptr};
};
} // namespace operators
......
......@@ -72,7 +72,7 @@ if(WITH_ASCEND)
endif()
if(WITH_ASCEND_CL)
cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor atlas_acl)
cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor atlas_acl atlas_acl_op_compiler)
endif()
add_subdirectory(dynload)
......
......@@ -83,6 +83,7 @@ bool AllowTF32Cudnn() { return allow_tf32_cudnn; }
DeviceContextPool* DeviceContextPool::pool = nullptr;
platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
VLOG(4) << "DeviceContextPool Get: " << place;
auto it = device_contexts_.find(place);
if (it == device_contexts_.end()) {
PADDLE_THROW(platform::errors::Unimplemented(
......@@ -243,6 +244,7 @@ NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
// ACL creates a default context which contains 1 default stream
// and 1 sync stream after aclrtSetDevice.
PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetCurrentContext(&context_));
stream_.reset(new stream::NPUStream(place));
}
NPUDeviceContext::~NPUDeviceContext() {
......@@ -255,6 +257,8 @@ void NPUDeviceContext::Wait() const {
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
}
aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
Place NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext* NPUDeviceContext::context() const {
......
......@@ -175,6 +175,9 @@ class NPUDeviceContext : public DeviceContext {
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
/*! \brief Return npu stream in the device context. */
aclrtStream stream() const;
#ifdef PADDLE_WITH_ASCEND_HCCL
/*! \brief Return bkcl context. */
HCCLContext_t hccl_context() const { return hccl_context_; }
......@@ -194,6 +197,8 @@ class NPUDeviceContext : public DeviceContext {
// Eventhough eigen_device_ is not used in NPU
// NOTE(zhiqiu): why need?
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
std::shared_ptr<stream::NPUStream> stream_;
DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
};
......
......@@ -49,6 +49,12 @@ int GetNPUDeviceCount() {
return dev_cnt;
}
int NPUCanAccessPeer(int src, int dst) {
int can = 0;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtDeviceCanAccessPeer(&can, src, dst));
return can;
}
// For example, "1.0.1"
std::string GetNPURuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(),
......@@ -167,10 +173,12 @@ size_t NPUMaxChunkSize() {
return max_chunk_size;
}
void NPUMemcpyASync(void *dst, const void *src, size_t count,
void NPUMemcpyAsync(void *dst, const void *src, size_t count,
enum aclrtMemcpyKind kind, aclrtStream stream,
size_t dst_max_count) {
dst_max_count = dst_max_count ? dst_max_count : count;
VLOG(4) << dst << " " << dst_max_count << " " << src << " " << count << " "
<< kind << " " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(
aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream));
}
......@@ -182,6 +190,21 @@ void NPUMemcpySync(void *dst, const void *src, size_t count,
PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind));
}
void NPUMemcpyPeerASync(void *dst, int dst_device, const void *src,
size_t count, enum aclrtMemcpyKind kind,
aclrtStream stream, size_t dst_max_count) {
dst_max_count = dst_max_count ? dst_max_count : count;
PADDLE_ENFORCE_NPU_SUCCESS(
aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream));
}
void NPUMemcpyPeerSync(void *dst, int dst_device, const void *src, size_t count,
enum aclrtMemcpyKind kind, size_t dst_max_count) {
// NOTE(zhiqiu): The default max_count is count
dst_max_count = dst_max_count ? dst_max_count : count;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind));
}
void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream,
size_t max_count) {
max_count = max_count ? max_count : count;
......
......@@ -31,10 +31,15 @@ int GetNPUDeviceCount();
//! Get the runtime version of the ith NPU
std::string GetNPURuntimeVersion(int id);
//! Check if this device can access peer or not.
int NPUCanAccessPeer(int src, int dst);
//! Get the current NPU device id in system.
int GetCurrentNPUDeviceId();
//! Get the current NPU stream.
int GetCurrentStream();
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedNPUDevices();
......@@ -79,6 +84,15 @@ void NPUMemcpySync(void *dst, const void *src, size_t count,
void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream,
size_t max_count = 0);
//! Copy memory from one device to another device asynchronously.
void NPUMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, aclrtStream stream,
size_t max_count = 0);
//! Copy memory from one device to another device synchronously.
void NPUMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count, size_t max_count = 0);
//! Blocks until stream has completed all operations.
void NPUStreamSync(aclrtStream stream);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册