[feature] support npu allocator, part 2 (#30972)

* support npu allocator * add npu device context * fix some compile problem * fix some compile problem * add npu info * compile ok * fix include dir * support naive_best_fit_allocator * run ut ok, but failed to exit * call aclrtResetDevice before exit * fix aclFinalize * add system allocator test * add selected_gpus in gtest * add tensor_test for npu * support npu op, initial commit * add npu stream * add elementwise_add_op * compile ok * fix typo * fix elementwise_add_op_npu_test * support op run * test can run but failed * change aclopExecuteV2 to aclopCompileAndExecute

[feature] support npu allocator, part 2 (#30972)
* support npu allocator * add npu device context * fix some compile problem * fix some compile problem * add npu info * compile ok * fix include dir * support naive_best_fit_allocator * run ut ok, but failed to exit * call aclrtResetDevice before exit * fix aclFinalize * add system allocator test * add selected_gpus in gtest * add tensor_test for npu * support npu op, initial commit * add npu stream * add elementwise_add_op * compile ok * fix typo * fix elementwise_add_op_npu_test * support op run * test can run but failed * change aclopExecuteV2 to aclopCompileAndExecute
1201cd2e · Leo Chen · GitHub · 7e049108 · 1201cd2e · 1201cd2e
10 changed file
--- a/cmake/external/ascend.cmake
+++ b/cmake/external/ascend.cmake
@@ -63,16 +63,22 @@ elseif(WITH_ASCEND_CL)
  set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
  set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
-  set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
+  set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
-  set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
+  set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
  set(atlas_acl_lib ${ATLAS_ACL_DIR}/libascendcl.so)
-  set(ATLAS_ACL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
+  set(atlas_acl_op_compiler_lib ${ATLAS_ACL_DIR}/libacl_op_compiler.so)
+  set(ATLAS_ACL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
  message(STATUS "ATLAS_ACL_INC_DIR ${ATLAS_ACL_INC_DIR}")
  message(STATUS "ATLAS_ACL_LIB_DIR ${ATLAS_ACL_DIR}")
  INCLUDE_DIRECTORIES(${ATLAS_ACL_INC_DIR})
  ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
  SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
-  add_custom_target(extern_ascend DEPENDS atlas_acl)
+  ADD_LIBRARY(atlas_acl_op_compiler SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET atlas_acl_op_compiler PROPERTY IMPORTED_LOCATION ${atlas_acl_op_compiler_lib})
+  add_custom_target(extern_ascend DEPENDS atlas_acl atlas_acl_op_compiler)
 endif()
--- a/paddle/fluid/framework/fleet/ascend_wrapper.h
+++ b/paddle/fluid/framework/fleet/ascend_wrapper.h
@@ -59,12 +59,14 @@ class AscendInstance {
  std::map<AscendString, AscendString> _GetDefaultInitSessionOptions() {
    std::map<AscendString, AscendString> init_options;
-    //init_options["a"] = "b";
+    // init_options["a"] = "b";
-    //init_options["ge.trainFlag"] = "1";
+    // init_options["ge.trainFlag"] = "1";
    return init_options;
  }
-  ge::Status InitGEForUT() { return ge::GEInitialize(_GetDefaultInitOptions()); }
+  ge::Status InitGEForUT() {
+    return ge::GEInitialize(_GetDefaultInitOptions());
+  }
  void InitGlobalResouces() {
    LOG(INFO) << "Begin ascend InitGlobalResouces";

--- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu_test.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu_test.cc
@@ -39,11 +39,9 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
  // init
  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<f::LoDTensor>();
-  tensor_x->Resize({10, 10});
  auto y = scope->Var("Y");
  auto tensor_y = y->GetMutable<f::LoDTensor>();
-  tensor_y->Resize({10, 10});
  std::vector<float> init;
  for (int64_t i = 0; i < 10 * 10; ++i) {
@@ -51,7 +49,11 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
  }
  TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({10, 10});
  TensorFromVector(init, ctx, tensor_y);
+  tensor_y->Resize({10, 10});
+  ctx.Wait();
  auto place = ctx.GetPlace();
  auto out = scope->Var("Out");
@@ -70,6 +72,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
  std::vector<float> out_vec;
  TensorToVector(*tensor_out, ctx, &out_vec);
+  ctx.Wait();
  EXPECT_EQ(out_vec.size(), init.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], 2.0);

--- a/paddle/fluid/operators/npu_op_runner.cc
+++ b/paddle/fluid/operators/npu_op_runner.cc
@@ -14,23 +14,29 @@ limitations under the License. */
 #include "paddle/fluid/operators/npu_op_runner.h"
-#include <paddle/fluid/framework/operator.h>
 #include <paddle/fluid/framework/data_type.h>
+#include <paddle/fluid/framework/operator.h>
 #include <map>
 #include <string>
 #include <vector>
 #include "acl/acl.h"
+#include "acl/acl_op_compiler.h"
 #include "paddle/fluid/framework/framework.pb.h"
 namespace paddle {
 namespace operators {
-static std::map<framework::proto::VarType::Type, aclDataType> DTYPE_2_ACL_DTYPE = {
+static std::map<framework::proto::VarType::Type, aclDataType>
-    {framework::proto::VarType::BOOL, ACL_BOOL},    {framework::proto::VarType::INT16, ACL_INT16},
+    DTYPE_2_ACL_DTYPE = {
-    {framework::proto::VarType::INT32, ACL_INT32},  {framework::proto::VarType::INT64, ACL_INT64},
+        {framework::proto::VarType::BOOL, ACL_BOOL},
-    {framework::proto::VarType::FP16, ACL_FLOAT16}, {framework::proto::VarType::FP32, ACL_FLOAT},
+        {framework::proto::VarType::INT16, ACL_INT16},
+        {framework::proto::VarType::INT32, ACL_INT32},
+        {framework::proto::VarType::INT64, ACL_INT64},
+        {framework::proto::VarType::FP16, ACL_FLOAT16},
+        {framework::proto::VarType::FP32, ACL_FLOAT},
        {framework::proto::VarType::FP64, ACL_DOUBLE},
 };
@@ -58,18 +64,22 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
  return iter->second;
 }
-NpuOpRunner::NpuOpRunner(std::string op_type) : op_type_(op_type) {}
+NpuOpRunner::NpuOpRunner(std::string op_type) : op_type_(op_type) {
+  attr_ = aclopCreateAttr();
+}
 NpuOpRunner::NpuOpRunner(std::string op_type, const std::vector<Tensor> &inputs,
                         const std::vector<Tensor> &outputs,
                         const AttributeMap &attrs)
    : op_type_(op_type) {
+  attr_ = aclopCreateAttr();
  AddInputs(inputs);
  AddOutputs(outputs);
  AddAttrs(attrs);
 }
 NpuOpRunner::~NpuOpRunner() {
-  //TODO(zhiqiu): handle free
+  // TODO(zhiqiu): handle free
 }
 const std::string &NpuOpRunner::Type() { return op_type_; }
@@ -84,23 +94,23 @@ NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name,
        aclopSetAttrInt(attr_, name.c_str(), BOOST_GET_CONST(int, attr)));
  } else if (attr.type() == typeid(int64_t)) {
-    PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrInt(
+    PADDLE_ENFORCE_NPU_SUCCESS(
-        attr_, name.c_str(), BOOST_GET_CONST(int64_t, attr)));
+        aclopSetAttrInt(attr_, name.c_str(), BOOST_GET_CONST(int64_t, attr)));
  } else if (attr.type() == typeid(float)) {
    PADDLE_ENFORCE_NPU_SUCCESS(
        aclopSetAttrFloat(attr_, name.c_str(), BOOST_GET_CONST(float, attr)));
  } else if (attr.type() == typeid(std::vector<bool>)) {
    auto a = BOOST_GET_CONST(std::vector<bool>, attr);
    std::vector<uint8_t> cast_a;
-    for(auto it : a) {
+    for (auto it : a) {
      cast_a.push_back(static_cast<uint8_t>(it));
    }
-    PADDLE_ENFORCE_NPU_SUCCESS(
+    PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrListBool(
-        aclopSetAttrListBool(attr_, name.c_str(), cast_a.size(), cast_a.data()));
+        attr_, name.c_str(), cast_a.size(), cast_a.data()));
  } else if (attr.type() == typeid(std::vector<int>)) {
    auto a = BOOST_GET_CONST(std::vector<int>, attr);
    std::vector<int64_t> cast_a;
-    for(auto it : a) {
+    for (auto it : a) {
      cast_a.push_back(static_cast<int64_t>(it));
    }
    PADDLE_ENFORCE_NPU_SUCCESS(
@@ -201,15 +211,22 @@ std::vector<aclTensorDesc *> &NpuOpRunner::GetOutputDescs() {
  return output_descs_;
 }
-std::vector<aclDataBuffer *> &NpuOpRunner::GetInputBuffers() { return input_buffers_; }
+std::vector<aclDataBuffer *> &NpuOpRunner::GetInputBuffers() {
+  return input_buffers_;
+}
-std::vector<aclDataBuffer *> &NpuOpRunner::GetOutputBuffers() { return output_buffers_; }
+std::vector<aclDataBuffer *> &NpuOpRunner::GetOutputBuffers() {
+  return output_buffers_;
+}
 aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) {
  auto dtype = ConvertToNpuDtype(tensor.type());
  auto format = ConvertToNpuFormat(tensor.layout());
  auto dims = framework::vectorize(tensor.dims());
+  VLOG(4) << dtype << " " << dims.size() << " " << dims[0] << "," << dims[1]
+          << " " << format;
  auto *desc = aclCreateTensorDesc(dtype, dims.size(), dims.data(), format);
  PADDLE_ENFORCE_NOT_NULL(
      desc, platform::errors::External("Call aclCreateTensorDesc failed."));
@@ -217,18 +234,26 @@ aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) {
 }
 aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) {
-  auto *buffer =
+  void *ptr = tensor.data<void>();
-      aclCreateDataBuffer(tensor.Holder()->ptr(), tensor.memory_size());
+  VLOG(4) << "ptr: " << ptr << ", size: " << tensor.memory_size();
+  auto *buffer = aclCreateDataBuffer(ptr, tensor.memory_size());
  PADDLE_ENFORCE_NOT_NULL(
      buffer, platform::errors::External("Call aclCreateDataBuffer failed."));
  return buffer;
 }
 void NpuOpRunner::Run(aclrtStream stream) {
-  aclError ret = aclopExecuteV2(op_type_.c_str(), input_descs_.size(),
+  VLOG(4) << "op_type: " << op_type_;
-                                input_descs_.data(), input_buffers_.data(),
+  VLOG(4) << "input_desc.size: " << input_descs_.size();
-                                output_descs_.size(), output_descs_.data(),
+  VLOG(4) << "output_desc.size: " << output_descs_.size();
-                                output_buffers_.data(), attr_, stream);
+  VLOG(4) << "stream: " << stream;
+  VLOG(4) << "attr: " << attr_;
+  aclError ret = aclopCompileAndExecute(
+      op_type_.c_str(), input_descs_.size(), input_descs_.data(),
+      input_buffers_.data(), output_descs_.size(), output_descs_.data(),
+      output_buffers_.data(), attr_, ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL,
+      stream);
+  VLOG(4) << "after aclopCompileAndExecute";
  PADDLE_ENFORCE_NPU_SUCCESS(ret);
 }
 }  // namespace operators

--- a/paddle/fluid/operators/npu_op_runner.h
+++ b/paddle/fluid/operators/npu_op_runner.h
@@ -32,7 +32,8 @@ using AttributeMap = framework::AttributeMap;
 class NpuOpRunner {
 public:
  explicit NpuOpRunner(std::string op_type);
-  explicit NpuOpRunner(std::string op_type, const std::vector<Tensor> &inputs = {},
+  explicit NpuOpRunner(std::string op_type,
+                       const std::vector<Tensor> &inputs = {},
                       const std::vector<Tensor> &outputs = {},
                       const AttributeMap &attrs = {});
@@ -40,7 +41,7 @@ class NpuOpRunner {
  const std::string &Type();
-  NpuOpRunner &AddAttr(const std::string& name, const Attribute &attr);
+  NpuOpRunner &AddAttr(const std::string &name, const Attribute &attr);
  NpuOpRunner &AddAttrs(const AttributeMap &attrs);
@@ -76,7 +77,7 @@ class NpuOpRunner {
  std::vector<aclDataBuffer *> output_buffers_;
  std::vector<aclTensorDesc *> input_descs_;
  std::vector<aclTensorDesc *> output_descs_;
-  aclopAttr *attr_;
+  aclopAttr *attr_{nullptr};
 };
 }  // namespace operators

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -72,7 +72,7 @@ if(WITH_ASCEND)
 endif()
 if(WITH_ASCEND_CL)
-    cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor atlas_acl)
+    cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor atlas_acl atlas_acl_op_compiler)
 endif()
 add_subdirectory(dynload)

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -83,6 +83,7 @@ bool AllowTF32Cudnn() { return allow_tf32_cudnn; }
 DeviceContextPool* DeviceContextPool::pool = nullptr;
 platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
+  VLOG(4) << "DeviceContextPool Get: " << place;
  auto it = device_contexts_.find(place);
  if (it == device_contexts_.end()) {
    PADDLE_THROW(platform::errors::Unimplemented(
@@ -243,6 +244,7 @@ NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
  // ACL creates a default context which contains 1 default stream
  // and 1 sync strean after aclrtSetDevice.
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetCurrentContext(&context_));
+  stream_.reset(new stream::NPUStream(place));
 }
 NPUDeviceContext::~NPUDeviceContext() {
@@ -255,6 +257,8 @@ void NPUDeviceContext::Wait() const {
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
 }
+aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
 Place NPUDeviceContext::GetPlace() const { return place_; }
 aclrtContext* NPUDeviceContext::context() const {

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -175,6 +175,9 @@ class NPUDeviceContext : public DeviceContext {
  /*! \brief  Wait for all operations completion in the stream. */
  void Wait() const override;
+  /*! \brief  Return npu stream in the device context. */
+  aclrtStream stream() const;
 #ifdef PADDLE_WITH_ASCEND_HCCL
  /*! \brief  Return bkcl context. */
  HCCLContext_t hccl_context() const { return hccl_context_; }
@@ -194,6 +197,8 @@ class NPUDeviceContext : public DeviceContext {
  // Eventhough eigen_device_ is not used in NPU
  // NOTE(zhiqiu): why need?
  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
+  std::shared_ptr<stream::NPUStream> stream_;
  DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
 };

--- a/paddle/fluid/platform/npu_info.cc
+++ b/paddle/fluid/platform/npu_info.cc
@@ -49,6 +49,12 @@ int GetNPUDeviceCount() {
  return dev_cnt;
 }
+// Queries ACL for peer access between the device ids `src` and `dst`.
+// Forwards both ids to aclrtDeviceCanAccessPeer and returns the flag that
+// call writes back; a non-success ACL status is raised through
+// PADDLE_ENFORCE_NPU_SUCCESS.
+int NPUCanAccessPeer(int src, int dst) {
+  int peer_access_flag = 0;
+  aclError status = aclrtDeviceCanAccessPeer(&peer_access_flag, src, dst);
+  PADDLE_ENFORCE_NPU_SUCCESS(status);
+  return peer_access_flag;
+}
 // For example, "1.0.1"
 std::string GetNPURuntimeVersion(int id) {
  PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(),
@@ -167,10 +173,12 @@ size_t NPUMaxChunkSize() {
  return max_chunk_size;
 }
-void NPUMemcpyASync(void *dst, const void *src, size_t count,
+void NPUMemcpyAsync(void *dst, const void *src, size_t count,
                    enum aclrtMemcpyKind kind, aclrtStream stream,
                    size_t dst_max_count) {
  dst_max_count = dst_max_count ? dst_max_count : count;
+  VLOG(4) << dst << " " << dst_max_count << " " << src << " " << count << " "
+          << kind << " " << stream;
  PADDLE_ENFORCE_NPU_SUCCESS(
      aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream));
 }
@@ -182,6 +190,21 @@ void NPUMemcpySync(void *dst, const void *src, size_t count,
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind));
 }
+// Copies `count` bytes from `src` to `dst` asynchronously on `stream`.
+//
+// The name and parameter list match the declaration in npu_info.h
+// (NPUMemcpyPeerAsync, with `src_device` and without a memcpy kind); the
+// previous spelling `NPUMemcpyPeerASync` with a different parameter list left
+// the declared function without a definition.
+//
+// dst/src:               destination and source addresses.
+// dst_device/src_device: device ids of the two buffers. They are not passed
+//                        to ACL here. NOTE(review): presumably the caller has
+//                        already enabled peer access between them - confirm.
+// count:                 number of bytes to copy.
+// stream:                stream the copy is enqueued on.
+// max_count:             capacity of the destination; 0 means "use count".
+void NPUMemcpyPeerAsync(void *dst, int dst_device, const void *src,
+                        int src_device, size_t count, aclrtStream stream,
+                        size_t max_count) {
+  // A zero max_count falls back to count, as in NPUMemcpyAsync.
+  max_count = max_count ? max_count : count;
+  // A peer copy is device-to-device, so the kind is fixed here.
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpyAsync(
+      dst, max_count, src, count, ACL_MEMCPY_DEVICE_TO_DEVICE, stream));
+}
+// Copies `count` bytes from `src` to `dst` synchronously.
+//
+// The parameter list matches the declaration in npu_info.h (with
+// `src_device` and without a memcpy kind); the previous definition took a
+// `kind` argument instead, so the declared overload had no definition.
+//
+// dst/src:               destination and source addresses.
+// dst_device/src_device: device ids of the two buffers. They are not passed
+//                        to ACL here. NOTE(review): presumably the caller has
+//                        already enabled peer access between them - confirm.
+// count:                 number of bytes to copy.
+// max_count:             capacity of the destination; 0 means "use count".
+void NPUMemcpyPeerSync(void *dst, int dst_device, const void *src,
+                       int src_device, size_t count, size_t max_count) {
+  // NOTE(zhiqiu):  The default max_count is count
+  max_count = max_count ? max_count : count;
+  // A peer copy is device-to-device, so the kind is fixed here.
+  PADDLE_ENFORCE_NPU_SUCCESS(
+      aclrtMemcpy(dst, max_count, src, count, ACL_MEMCPY_DEVICE_TO_DEVICE));
+}
 void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream,
                    size_t max_count) {
  max_count = max_count ? max_count : count;

--- a/paddle/fluid/platform/npu_info.h
+++ b/paddle/fluid/platform/npu_info.h
@@ -31,10 +31,15 @@ int GetNPUDeviceCount();
 //! Get the runtime version of the ith NPU
 std::string GetNPURuntimeVersion(int id);
+//! Check if this device can access peer or not.
+int NPUCanAccessPeer(int src, int dst);
 //! Get the current NPU device id in system.
 int GetCurrentNPUDeviceId();
+//! Get the current NPU stream.
+int GetCurrentStream();
 //! Get a list of device ids from environment variable or use all.
 std::vector<int> GetSelectedNPUDevices();
@@ -79,6 +84,15 @@ void NPUMemcpySync(void *dst, const void *src, size_t count,
 void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream,
                    size_t max_count = 0);
+//! Copy memory from one device to another device asynchronously.
+void NPUMemcpyPeerAsync(void *dst, int dst_device, const void *src,
+                        int src_device, size_t count, aclrtStream stream,
+                        size_t max_count = 0);
+//! Copy memory from one device to another device synchronously.
+void NPUMemcpyPeerSync(void *dst, int dst_device, const void *src,
+                       int src_device, size_t count, size_t max_count = 0);
 //! Blocks until stream has completed all operations.
 void NPUStreamSync(aclrtStream stream);