Commit 43cfd16e

!444 gpu dynamic memory pool supports multi-graph

Merge pull request !444 from limingqi107/master

Authored: Apr 18, 2020 by mindspore-ci-bot
Committed: Apr 18, 2020 by Gitee
Parents: 25b0a8c3, 5e01b94c
Showing 2 changed files with 23 additions and 7 deletions (+23 -7):

mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc  (+19 -5)
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.h   (+4 -2)
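In short, GPUKernelRuntime previously kept a single mem_reuse_util_ptr_ member, so it could only hold the memory-reuse bookkeeping of the most recently initialized graph; this commit replaces that member with a map keyed by graph id so that each kernel graph keeps its own reuse information. A minimal sketch of that pattern follows, with simplified stand-in types (MemReuseUtil and RuntimeSketch here are illustrative, not the real MindSpore classes):

#include <cstdint>
#include <memory>
#include <unordered_map>

// Sketch only: MemReuseUtil stands in for MindSpore's real memory-reuse helper.
struct MemReuseUtil {
  void SetReuseRefCount() {}
  void SetGraphOutputRefCount() {}
};
using MemReuseUtilPtr = std::shared_ptr<MemReuseUtil>;

class RuntimeSketch {
 public:
  // Keying the reuse info by graph_id lets several graphs coexist in one runtime,
  // instead of each initialization overwriting a single member.
  void InitKernelRefCount(uint32_t graph_id) {
    auto mem_reuse_util_ptr = std::make_shared<MemReuseUtil>();
    mem_reuse_util_ptr->SetReuseRefCount();
    mem_reuse_util_ptr->SetGraphOutputRefCount();
    mem_reuse_util_map_[graph_id] = mem_reuse_util_ptr;
  }

  // Later phases (launch, free) look the info up by the graph they are executing.
  MemReuseUtilPtr ReuseInfoFor(uint32_t graph_id) { return mem_reuse_util_map_[graph_id]; }

 private:
  std::unordered_map<uint32_t, MemReuseUtilPtr> mem_reuse_util_map_;
};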
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc
@@ -155,7 +155,8 @@ void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) {
   mem_reuse_util_ptr->SetReuseRefCount();
   // Can't free the device address of graph output, so set the reference count of graph output specially.
   mem_reuse_util_ptr->SetGraphOutputRefCount();
-  mem_reuse_util_ptr_ = mem_reuse_util_ptr;
+  auto graph_id = graph->graph_id();
+  mem_reuse_util_map_[graph_id] = mem_reuse_util_ptr;
 }
 
 void GPUKernelRuntime::InitKernelOutputAddress(const session::KernelGraph *graph) {
@@ -179,6 +180,7 @@ void GPUKernelRuntime::InitKernelOutputAddress(const session::KernelGraph *graph
 bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph) {
   MS_EXCEPTION_IF_NULL(graph);
+  auto graph_id = graph->graph_id();
   // The inputs and outputs memory of communication kernel are special, so separate processing.
   AllocCommunicationOpDynamicRes(graph);
@@ -194,7 +196,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph) {
       MS_LOG(ERROR) << "Launch kernel failed.";
       return false;
     }
-    FreeKernelDynamicRes(kernel, kernel_workspaces);
+    FreeKernelDynamicRes(kernel, kernel_workspaces, graph_id);
   }
 
   if (!SyncStream()) {
@@ -341,14 +343,16 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf
 }
 
 void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
-                                            const AddressPtrList &kernel_workspaces) {
+                                            const AddressPtrList &kernel_workspaces, uint32_t graph_id) {
   MS_EXCEPTION_IF_NULL(kernel);
   MS_EXCEPTION_IF_NULL(mem_manager_);
+  auto mem_reuse_util_ptr = mem_reuse_util_map_[graph_id];
+  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
   auto cnode = kernel->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(cnode);
   // Free the input of kernel by reference count.
   for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
-    auto kernel_ref_count_ptr = mem_reuse_util_ptr_->GetKernelInputRef(cnode, i);
+    auto kernel_ref_count_ptr = mem_reuse_util_ptr->GetKernelInputRef(cnode, i);
     if (kernel_ref_count_ptr == nullptr) {
       continue;
     }
@@ -361,7 +365,6 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
       // Reset the reference count.
       kernel_ref_count_ptr->ref_count_dynamic_use_ = kernel_ref_count_ptr->ref_count_;
       bool is_communication_op = false;
       // The inputs and outputs memory of communication kernel are special, so separate processing.
       FreeCommunicationOpDynamicRes(kernel, i, &is_communication_op);
       if (!is_communication_op) {
         auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i);
@@ -369,6 +372,17 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
       }
     }
   }
+  // Free the output of kernel, if output has no reference.
+  for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) {
+    auto kernel_ref_count_ptr = mem_reuse_util_ptr->GetRef(cnode, i);
+    if (kernel_ref_count_ptr == nullptr) {
+      continue;
+    }
+    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
+      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
+      mem_manager_->FreeMemFromMemPool(device_address);
+    }
+  }
   // Free the workspace of kernel.
   for (size_t i = 0; i < kernel_workspaces.size(); ++i) {
     auto workspace = kernel_workspaces[i];
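The notable addition in this file is the second loop at the end of FreeKernelDynamicRes: outputs whose dynamic reference count is already zero (no later kernel in the graph reads them) are returned to the memory pool right after the kernel is launched. A rough, self-contained illustration of that rule, using a hypothetical RefCount record and free_output callback in place of the real kernel reference-count class and mem_manager_->FreeMemFromMemPool:

#include <cstddef>
#include <functional>
#include <vector>

// Hypothetical simplified record; the real bookkeeping lives in MindSpore's mem_reuse classes.
struct RefCount {
  int ref_count_ = 0;              // consumer count determined at graph-compile time
  int ref_count_dynamic_use_ = 0;  // runtime countdown; 0 means no remaining reader
};

// Free every output buffer that has no remaining reference.
// free_output stands in for returning the device address to the dynamic memory pool.
void FreeUnreferencedOutputs(const std::vector<RefCount> &outputs,
                             const std::function<void(size_t)> &free_output) {
  for (size_t i = 0; i < outputs.size(); ++i) {
    if (outputs[i].ref_count_dynamic_use_ == 0) {
      free_output(i);  // hand the buffer of output i back to the pool
    }
  }
}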
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.h
@@ -21,6 +21,7 @@
 #include <memory>
 #include <vector>
 #include <utility>
+#include <unordered_map>
 #include "device/kernel_runtime.h"
 #include "device/kernel_runtime_manager.h"
@@ -57,11 +58,12 @@ class GPUKernelRuntime : public KernelRuntime {
   void AllocCommunicationOpDynamicRes(const session::KernelGraph *graph);
   void AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel);
   void AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel);
-  void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, const AddressPtrList &kernel_workspaces);
+  void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, const AddressPtrList &kernel_workspaces,
+                            uint32_t graph_id);
   void FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr &kernel, size_t input_idx, bool *is_communication_op);
   size_t communication_op_input_ref_count_{0};
   size_t communication_op_output_ref_count_{0};
-  MemReuseUtilPtr mem_reuse_util_ptr_{nullptr};
+  std::unordered_map<uint32_t, MemReuseUtilPtr> mem_reuse_util_map_;
 };
 MS_REG_KERNEL_RUNTIME(kGPUDevice, GPUKernelRuntime);
 }  // namespace gpu
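The header change mirrors the .cc side: FreeKernelDynamicRes now takes the graph id, and the single MemReuseUtilPtr member becomes a std::unordered_map so reuse information survives across graphs. A tiny self-contained demonstration of why the map matters, using a placeholder ReuseInfo type instead of the real MemReuseUtilPtr:

#include <cassert>
#include <cstdint>
#include <memory>
#include <unordered_map>

// Placeholder for illustration; the real map stores MemReuseUtilPtr values.
struct ReuseInfo { uint32_t owner_graph; };

int main() {
  std::unordered_map<uint32_t, std::shared_ptr<ReuseInfo>> mem_reuse_util_map_;
  // Initializing a second graph no longer overwrites the first graph's bookkeeping,
  // which a single mem_reuse_util_ptr_ member could not guarantee.
  mem_reuse_util_map_[0] = std::make_shared<ReuseInfo>(ReuseInfo{0});
  mem_reuse_util_map_[1] = std::make_shared<ReuseInfo>(ReuseInfo{1});
  assert(mem_reuse_util_map_.at(0)->owner_graph == 0);
  assert(mem_reuse_util_map_.at(1)->owner_graph == 1);
  return 0;
}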