Commit c0070d3d — magicwindyyd/mindspore (forked from MindSpore/mindspore)
Authored on Sep 07, 2020 by Zhang Qinghua
Parent: 77dd91a6

    Use the unified Execute function to run Graph or Single Op Graph.
Showing 12 changed files with 1,291 additions and 1,298 deletions (+1291 / -1298)
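The commit collapses the per-mode run helpers into a single Execute entry point that tells the kernel runtime whether task sink may be used. A minimal sketch of that dispatch shape, with stand-in types (KernelGraph, ContextEnableTaskSink, and this Run signature are placeholders for illustration, not the MindSpore API):

    #include <iostream>
    #include <memory>

    // Hypothetical stand-ins; only the control flow mirrors the commit.
    struct KernelGraph {};
    bool ContextEnableTaskSink() { return true; }  // cf. get_param<bool>(MS_CTX_ENABLE_TASK_SINK)

    struct KernelRuntime {
      // After the commit, Run() takes the task-sink decision from the caller.
      bool Run(KernelGraph *graph, bool is_task_sink) {
        (void)graph;
        std::cout << (is_task_sink ? "run graph as a sunk task\n" : "launch kernels one by one\n");
        return true;
      }
    };

    // Session-level Execute: a whole graph may use task sink (is_task = true);
    // a single-op graph never does (is_task = false).
    bool Execute(KernelRuntime *rt, const std::shared_ptr<KernelGraph> &graph, bool is_task) {
      bool is_task_sink = is_task && ContextEnableTaskSink();
      return rt->Run(graph.get(), is_task_sink);
    }

    int main() {
      KernelRuntime rt;
      auto graph = std::make_shared<KernelGraph>();
      Execute(&rt, graph, true);   // RunGraph path
      Execute(&rt, graph, false);  // RunOp path
      return 0;
    }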
Changed files:

  mindspore/ccsrc/backend/session/ascend_session.cc                (+14 / -16)
  mindspore/ccsrc/backend/session/ascend_session.h                 (+3 / -2)
  mindspore/ccsrc/backend/session/cpu_session.cc                   (+1 / -1)
  mindspore/ccsrc/backend/session/gpu_session.cc                   (+2 / -2)
  mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc   (+2 / -8)
  mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h    (+2 / -2)
  mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc         (+1 / -1)
  mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h          (+1 / -1)
  mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc         (+1150 / -1150)
  mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h          (+112 / -112)
  mindspore/ccsrc/runtime/device/kernel_runtime.cc                 (+1 / -1)
  mindspore/ccsrc/runtime/device/kernel_runtime.h                  (+2 / -2)
mindspore/ccsrc/backend/session/ascend_session.cc

@@ -318,7 +318,7 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::
 #endif
   {
     // run task on device
-    Execute(kernel_graph);
+    Execute(kernel_graph, true);
   }
   // summary
   Summary(kernel_graph.get());

@@ -348,17 +348,6 @@ void AscendSession::RunOpHardwareOptimize(const std::shared_ptr<session::KernelG
   MS_LOG(INFO) << "Finish";
 }
-void AscendSession::RunOpExecTask(const std::shared_ptr<KernelGraph> &kernel_graph) const {
-  MS_LOG(INFO) << "Start!";
-  auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
-  MS_EXCEPTION_IF_NULL(runtime_instance);
-  bool ret_ok = runtime_instance->LaunchKernel(kernel_graph.get());
-  if (!ret_ok) {
-    MS_LOG(EXCEPTION) << "Run task error!";
-  }
-  MS_LOG(INFO) << "Finish!";
-}
 bool AscendSession::GraphCacheExist(const GraphInfo &graph_info) const {
   return run_op_graphs_.find(graph_info) != run_op_graphs_.end();
 }

@@ -398,7 +387,7 @@ void AscendSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph_i
   // load input data to device
   LoadInputData(graph, input_tensors);
   // run op
-  RunOpExecTask(graph);
+  Execute(graph, false);
   // get output
   if (op_run_info.value != nullptr) {
     std::vector<tensor::TensorPtr> pre_output_tensors;

@@ -552,21 +541,30 @@ void AscendSession::RunOpMemoryClear(const KernelGraph *kernel_graph) const {
 void AscendSession::Load(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   MS_LOG(INFO) << "Start!";
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  bool is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
   (void)device::KernelAdjust::GetInstance().StepLoadCtrlInputs(kernel_graph);
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
-  bool ret_ok = runtime_instance->Load(kernel_graph.get());
+  bool ret_ok = runtime_instance->Load(kernel_graph.get(), is_task_sink);
   if (!ret_ok) {
     MS_LOG(EXCEPTION) << "Load task error!";
   }
   MS_LOG(INFO) << "Finish!";
 }
-void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
+void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const {
   MS_LOG(INFO) << "Start!";
+  bool is_task_sink = false;
+  if (is_task) {
+    auto context_ptr = MsContext::GetInstance();
+    MS_EXCEPTION_IF_NULL(context_ptr);
+    is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
+  }
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
-  bool ret_ok = runtime_instance->Run(kernel_graph.get());
+  bool ret_ok = runtime_instance->Run(kernel_graph.get(), is_task_sink);
   if (!ret_ok) {
     MS_LOG(EXCEPTION) << "run task error!";
   }
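With these hunks, graph-mode and single-op execution converge on one entry point: RunGraph calls Execute(kernel_graph, true) (task sink allowed when MS_CTX_ENABLE_TASK_SINK is set), RunOp calls Execute(graph, false), and the per-op RunOpExecTask helper disappears. The is_task_sink decision now travels into KernelRuntime::Run and Load as an argument instead of being re-read from MsContext inside the runtime.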
mindspore/ccsrc/backend/session/ascend_session.h

@@ -13,8 +13,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 #ifndef MINDSPORE_CCSRC_BACKEND_SESSION_ASCEND_SESSION_H
 #define MINDSPORE_CCSRC_BACKEND_SESSION_ASCEND_SESSION_H
 #include <unordered_map>
 #include <string>
 #include <memory>

@@ -82,13 +84,12 @@ class AscendSession : public SessionBasic {
                         KernelGraph *kernel_graph) const;
   void RunOpMemoryClear(const KernelGraph *kernel_graph) const;
   void Load(const std::shared_ptr<KernelGraph> &kernel_graph) const;
-  void Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+  void Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const;
   void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;
   void DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs);
   void LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
   // below functions are used for run op
   void RunOpHardwareOptimize(const std::shared_ptr<session::KernelGraph> &kernel_graph) const;
-  void RunOpExecTask(const std::shared_ptr<KernelGraph> &kernel_graph) const;
   static void BackendOptimization(const std::vector<KernelGraphPtr> &all_graphs);
   static void LinkChildGraphs(NotNull<KernelGraphPtr> graph);
mindspore/ccsrc/backend/session/cpu_session.cc

@@ -118,7 +118,7 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
     debugger_->PreExecute(kernel_graph);
   }
 #endif
-  bool ret = runtime_.Run(kernel_graph.get());
+  bool ret = runtime_.Run(kernel_graph.get(), false);
   if (!ret) {
     MS_LOG(EXCEPTION) << "Run graph failed";
   }
mindspore/ccsrc/backend/session/gpu_session.cc

@@ -191,9 +191,9 @@ void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
 #ifdef ENABLE_DEBUGGER
-  if (!runtime_instance->Run(kernel_graph.get(), debugger_.get())) {
+  if (!runtime_instance->Run(kernel_graph.get(), false, debugger_.get())) {
 #else
-  if (!runtime_instance->Run(kernel_graph.get())) {
+  if (!runtime_instance->Run(kernel_graph.get(), false)) {
 #endif
     MS_LOG(EXCEPTION) << "GPU execute graph failed!";
   }
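Both non-Ascend sessions pass false for the new is_task_sink argument, which preserves the old behavior: the CPU runtime never consulted the task-sink flag, and GPU execution never went through the task-sink path, which appears to be specific to the Ascend backend.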
mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc

@@ -454,10 +454,7 @@ DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size
   return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id);
 }
-bool AscendKernelRuntime::Load(session::KernelGraph *graph) {
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
-  bool is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
+bool AscendKernelRuntime::Load(session::KernelGraph *graph, bool is_task_sink) {
   if (!is_task_sink) {
     return true;
   }

@@ -609,17 +606,14 @@ void AscendKernelRuntime::DebugTaskIdName(GraphId graph_id) {
   }
 }
-bool AscendKernelRuntime::Run(session::KernelGraph *graph, Debugger *debugger) {
+bool AscendKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger) {
   bool ret = false;
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
 #if defined(_WIN32) || defined(_WIN64)
   auto start_time = std::chrono::steady_clock::now();
 #else
   struct timeval start_time, end_time;
   (void)gettimeofday(&start_time, nullptr);
 #endif
-  bool is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
   if (is_task_sink) {
     ret = RunTask(graph);
   } else {
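Run keeps its platform-conditional wall-clock measurement: std::chrono::steady_clock on Windows and gettimeofday elsewhere. A minimal sketch of the same microsecond measurement done portably with std::chrono alone (illustrative only, not part of the commit):

    #include <chrono>
    #include <cstdint>
    #include <iostream>

    int main() {
      auto start_time = std::chrono::steady_clock::now();
      // ... launch the graph here ...
      auto end_time = std::chrono::steady_clock::now();
      // duration_cast truncates to whole microseconds, matching the us log unit above.
      uint64_t cost_us =
        std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
      std::cout << "run graph in " << cost_us << " us\n";
      return 0;
    }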
mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h

@@ -44,8 +44,8 @@ class AscendKernelRuntime : public KernelRuntime {
   bool GenTask(const session::KernelGraph *graph);
   bool LoadTask(const session::KernelGraph *graph);
   bool RunTask(const session::KernelGraph *graph);
-  bool Load(session::KernelGraph *graph) override;
-  bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
+  bool Load(session::KernelGraph *graph, bool is_task_sink) override;
+  bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override;
   void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                  const std::unordered_set<ValueNodePtr> &value_nodes,
                                  const std::vector<CNodePtr> &execution_order) override;
mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc

@@ -287,7 +287,7 @@ void CPUKernelRuntime::DecreaseSummaryRefCount(const session::NamedSummaryOutput
   resource_manager_.DecreaseSummaryRefCount(summary_outputs);
 }
-bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, Debugger *debugger) {
+bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink, Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(kernel_graph);
   resource_manager_.IncreaseAddressRefCount(kernel_graph);
mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h

@@ -36,7 +36,7 @@ class CPUKernelRuntime : public KernelRuntime {
   ~CPUKernelRuntime() override = default;
   bool Init() override { return true; }
-  bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
+  bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override;
   void AssignKernelAddress(session::KernelGraph *kernel_graph);
   void BindInputOutput(session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
                        VectorRef *outputs);
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc

/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "runtime/device/gpu/gpu_kernel_runtime.h"
#include <algorithm>
#include "runtime/device/gpu/gpu_device_address.h"
#include "runtime/device/gpu/cuda_driver.h"
#include "runtime/device/gpu/gpu_buffer_mgr.h"
#include "runtime/device/gpu/gpu_device_manager.h"
#include "runtime/device/gpu/gpu_memory_allocator.h"
#include "runtime/device/gpu/distribution/collective_init.h"
#include "utils/convert_utils.h"
#include "utils/ms_context.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "runtime/device/gpu/gpu_common.h"
#include "utils/ms_utils.h"
#include "runtime/device/gpu/gpu_memory_manager.h"
#include "backend/kernel_compiler/common_utils.h"
#include "runtime/device/gpu/gpu_memory_copy_manager.h"
#include "common/trans.h"
#include "ir/dtype.h"
#include "profiler/device/gpu/gpu_profiling.h"
#include "utils/shape_utils.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debug_services.h"
#endif

namespace mindspore {
namespace device {
namespace gpu {
using mindspore::device::memswap::MemSwapInfoSet;
using mindspore::device::memswap::MemSwapManager;
using mindspore::device::memswap::SwapKind;
static const size_t PARAMETER_OUTPUT_INDEX = 0;

bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }

bool GPUKernelRuntime::Init() {
  if (device_init_ == true) {
    GPUMemoryAllocator::GetInstance().CheckMaxDeviceMemory();
    return true;
  }
  bool ret = false;
#ifdef ENABLE_DUMP_E2E
  ret = SetDumpConf();
  if (!ret) {
    MS_LOG(INFO) << "No dump conf to set!";
  }
#endif
  ret = InitDevice();
  if (!ret) {
    MS_LOG(ERROR) << "InitDevice error.";
    return ret;
  }
  mem_manager_ = std::make_shared<GPUMemoryManager>();
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->MallocDeviceMemory();
  const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  bool collective_inited = CollectiveInitializer::instance().collective_inited();
  if (collective_inited && collective_handle_ != nullptr) {
    auto init_nccl_comm_funcptr =
      reinterpret_cast<InitNCCLComm>(dlsym(const_cast<void *>(collective_handle_), "InitNCCLComm"));
    MS_EXCEPTION_IF_NULL(init_nccl_comm_funcptr);
    (*init_nccl_comm_funcptr)();
  }
  device_init_ = true;
  return ret;
}

#ifdef ENABLE_DUMP_E2E
namespace {
void DumpOutput(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
                Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(dump_conf);
  bool trans_flag = dump_conf->trans_flag();
  const auto &apply_kernels = graph->execution_order();
  for (const auto &node : apply_kernels) {
    MS_EXCEPTION_IF_NULL(node);
    auto node_name = AnfAlgo::GetCNodeName(node);
    std::string kernel_name = node->fullname_with_scope();
    if (!dump_conf->IsKernelNeedDump(kernel_name)) {
      continue;
    }
    const std::string strsrc = "/";
    const std::string strdst = "--";
    std::string::size_type pos = 0;
    std::string::size_type srclen = strsrc.size();
    std::string::size_type dstlen = strdst.size();
    while ((pos = kernel_name.find(strsrc, pos)) != std::string::npos) {
      kernel_name.replace(pos, srclen, strdst);
      pos += dstlen;
    }
    auto output_size = AnfAlgo::GetOutputTensorNum(node);
    for (size_t j = 0; j < output_size; ++j) {
      auto addr = AnfAlgo::GetOutputAddr(node, j);
      TypeId addr_type_id = addr->type_id();
      std::string addr_format = addr->format();
      ShapeVector int_shapes;
      if (trans_flag) {
        int_shapes = trans::GetRuntimePaddingShape(node, j);
      } else {
        auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
        (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                             [](size_t inner_item) { return SizeToInt(inner_item); });
      }
      auto type = AnfAlgo::GetOutputInferDataType(node, j);
      auto format = kOpFormat_DEFAULT;
      string filepath = dump_path + '/' + kernel_name + '_' + "output_" + std::to_string(j);
      DebugServices *debug_services = debugger->debug_services();
      TensorLoader *tensor_loader = debug_services->tensor_loader();
      std::string original_kernel_name = node->fullname_with_scope();
      size_t slot = j;
      auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type,
                                                 addr_type_id, addr_format, slot);
      if (!ret) {
        std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath +
                            ", host_format:" + format + ".!";
      }
    }
  }
}

void DumpParameters(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
                    Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(dump_conf);
  bool trans_flag = dump_conf->trans_flag();
  const auto &parameters = graph->inputs();
  for (auto &item : parameters) {
    if (!item->isa<Parameter>()) {
      continue;
    }
    std::string parameter_name = item->fullname_with_scope();
    if (!dump_conf->IsKernelNeedDump(parameter_name)) {
      continue;
    }
    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
    TypeId addr_type_id = addr->type_id();
    std::string addr_format = addr->format();
    ShapeVector int_shapes;
    if (trans_flag) {
      int_shapes = trans::GetRuntimePaddingShape(item, PARAMETER_OUTPUT_INDEX);
    } else {
      auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                           [](size_t inner_item) { return SizeToInt(inner_item); });
    }
    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    string filepath = dump_path + '/' + parameter_name + '_' + "output_0";
    DebugServices *debug_services = debugger->debug_services();
    TensorLoader *tensor_loader = debug_services->tensor_loader();
    std::string original_kernel_name = parameter_name;
    size_t slot = 0;
    auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type,
                                               addr_type_id, addr_format, slot);
    if (!ret) {
      std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath +
                          ", host_format:" + format + ".!";
    }
  }
}
}  // namespace

bool GPUKernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_LOG(INFO) << "Start dump step";
  DumpConfPtr dump_conf = GetDumpConf();
  MS_EXCEPTION_IF_NULL(dump_conf);
  dump_conf->UpdataCurIter();
  bool dump_flag = dump_conf->dump_enable();
  if (!dump_flag) {
    MS_LOG(INFO) << "Dump flag is disable, pass dump step";
    return true;
  }
  uint32_t cur_iter = dump_conf->cur_iter();
  if (dump_conf->dump_iter() != 0) {
    if (cur_iter != dump_conf->dump_iter()) {
      return true;
    }
  }
  MS_LOG(INFO) << "Cur iter is " << cur_iter;
  std::string net_name = dump_conf->dump_net_name();
  std::string iterator = std::to_string(cur_iter);
  std::string dump_path = dump_conf->dump_path();
  if (dump_path.back() == '/') {
    dump_path = dump_path + net_name + '/' + iterator;
  } else {
    dump_path = dump_path + '/' + net_name + '/' + iterator;
  }
  // dump output
  DumpOutput(graph, dump_path, dump_conf, debugger);
  // dump parameters
  DumpParameters(graph, dump_path, dump_conf, debugger);
  return true;
}
#endif

#ifdef ENABLE_DEBUGGER
namespace {
void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_inputs,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_workspaces,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_outputs, int exec_order, void *stream_ptr,
                    bool dump_enabled) {
  // check if we should read the kernel data
  bool read_data = false;
  std::string kernel_name = kernel->fullname_with_scope();
  if (debugger) {
    debugger->SetCurNode(kernel_name);
    if (dump_enabled) {
      read_data = true;
    } else if (debugger->debugger_enabled()) {
      read_data = debugger->ReadNodeDataRequired();
    }
  }
  if (!read_data) {
    return;
  }
  // get inputs
  auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  for (size_t j = 0; j < input_size; ++j) {
    auto input_kernel = kernel->input(j + 1);
    std::string input_kernel_name = input_kernel->fullname_with_scope();
    auto addr = kernel_inputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string input_tensor_name = input_kernel_name + ':' + "0";
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
    }
  }
  // get outputs
  auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  for (size_t j = 0; j < output_size; ++j) {
    auto addr = kernel_outputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(kernel, j);
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string tensor_name = kernel_name + ':' + std::to_string(j);
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(kernel, j);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
    }
  }
  debugger->PostExecuteNode();
}

void UpdateStepNum(Debugger *debugger, bool dump_enabled) {
  if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
    auto cur_step_num = debugger->step_num();
    cur_step_num = cur_step_num + 1;
    debugger->SetStepNum(cur_step_num);
  }
}

void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) {
  MS_EXCEPTION_IF_NULL(graph);
  if (!(debugger && dump_enabled)) {
    return;
  }
  const auto &parameters = graph->inputs();
  // for parameters, set its execution order to be 0;
  int exec_order = 0;
  for (auto &item : parameters) {
    if (!item->isa<Parameter>()) {
      continue;
    }
    std::string parameter_name = item->fullname_with_scope();
    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    string tensor_name = parameter_name + ':' + "0";
    auto gpu_addr = dynamic_cast<const mindspore::device::gpu::GPUDeviceAddress *>(addr);
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
    }
  }
}

void ClearCurrentData(Debugger *debugger, bool dump_enabled) {
  if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
    DebugServices *debug_services = debugger->debug_services();
    TensorLoader *tensor_loader = debug_services->tensor_loader();
    tensor_loader->EmptyCurrentTensor();
  }
}
}  // namespace
#endif

DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                                       TypeId type_id) {
  return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
}

bool GPUKernelRuntime::InitDevice() {
  if (GPUDeviceManager::GetInstance().device_count() <= 0) {
    MS_LOG(ERROR) << "No GPU device found.";
    return false;
  }
  const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  bool collective_inited = CollectiveInitializer::instance().collective_inited();
  if (collective_inited && collective_handle_ != nullptr) {
    auto get_local_rank_funcptr =
      reinterpret_cast<GetLocalRankId>(dlsym(const_cast<void *>(collective_handle_), "local_rank_id"));
    MS_EXCEPTION_IF_NULL(get_local_rank_funcptr);
    device_id_ = IntToUint((*get_local_rank_funcptr)());
  }
  if (!GPUDeviceManager::GetInstance().is_device_id_init()) {
    if (!GPUDeviceManager::GetInstance().set_cur_device_id(device_id_)) {
      MS_LOG(ERROR) << "Failed to set current device to " << SizeToInt(device_id_);
      return false;
    }
  }
  GPUDeviceManager::GetInstance().InitDevice();
  stream_ = GPUDeviceManager::GetInstance().default_stream();
  if (stream_ == nullptr) {
    MS_LOG(ERROR) << "No default CUDA stream found.";
    return false;
  }
  return true;
}

void GPUKernelRuntime::ReleaseDeviceRes() {
  // For dataset mode.
  if (GpuBufferMgr::GetInstance().IsInit()) {
    if (!GpuBufferMgr::GetInstance().IsClosed()) {
      if (!GpuBufferMgr::GetInstance().CloseNotify()) {
        MS_LOG(EXCEPTION) << "Could not close gpu data queue.";
      }
    }
    CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue.");
  }
  // Destroy remaining memory swap events and free host memory.
  for (auto &item : mem_swap_map_) {
    auto &mem_swap_manager = item.second;
    MS_EXCEPTION_IF_NULL(mem_swap_manager);
    if (mem_swap_manager->trigger_swap()) {
      mem_swap_manager->ClearSwapQueue(false);
      mem_swap_manager->ReleaseHostPinnedMem();
    }
  }
  GPUDeviceManager::GetInstance().ReleaseDevice();
  if (mem_manager_ != nullptr) {
    mem_manager_->FreeDeviceMemory();
  }
  kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
  MS_EXCEPTION_IF_NULL(bin_map);
  bin_map->RemoveKernelCache();
}

void GPUKernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                                 const std::unordered_set<ValueNodePtr> &value_nodes,
                                                 const std::vector<CNodePtr> &execution_order) {
  MS_LOG(INFO) << "Clear graph:" << graph_id << " GPU runtime resource";
  // Release the kernel resource.
  for (const auto &kernel : execution_order) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    if (kernel_mod == nullptr) {
      continue;
    }
    kernel_mod->ReleaseResource();
  }
  // Clear the output address of graph.
  ClearOutputAddress(inputs, value_nodes, execution_order);
}

void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->ResetDynamicMemory();
  AssignStaticMemoryInput(graph);
  AssignStaticMemoryValueNode(graph);
  bool is_enable_dynamic_mem = context_ptr->get_param<bool>(MS_CTX_ENABLE_DYNAMIC_MEM_POOL);
  if (is_enable_dynamic_mem) {
    // Use the dynamic memory pool.
    InitKernelRefCount(graph);
    InitMemorySwapInfo(graph);
    InitKernelOutputAddress(graph);
    InitKernelWorkspaceAddress(graph);
    SaveGraphOutputNode(graph);
  } else {
    AssignDynamicMemory(graph);
  }
}

bool GPUKernelRuntime::Run(session::KernelGraph *graph, Debugger *debugger) {
  struct timeval start_time, end_time;
  (void)gettimeofday(&start_time, nullptr);
  bool ret = true;
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  bool is_enable_dynamic_mem = context_ptr->get_param<bool>(MS_CTX_ENABLE_DYNAMIC_MEM_POOL);
  bool is_enable_pynative_infer = context_ptr->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER);
  if (is_enable_dynamic_mem && !is_enable_pynative_infer) {
    auto graph_id = graph->graph_id();
    auto iter = mem_swap_map_.find(graph_id);
    if (iter == mem_swap_map_.end()) {
      MS_LOG(EXCEPTION) << "Find memory swap map failed.";
    }
    mem_swap_manager_ = iter->second;
    MS_EXCEPTION_IF_NULL(mem_swap_manager_);
    auto mem_reuse_iter = mem_reuse_util_map_.find(graph_id);
    if (mem_reuse_iter == mem_reuse_util_map_.end()) {
      MS_LOG(EXCEPTION) << "Find memory reuse map failed.";
    }
    mem_reuse_util_ = mem_reuse_iter->second;
    MS_EXCEPTION_IF_NULL(mem_reuse_util_);
    ret = RunOneStep(graph, debugger);
  } else {
    ret = LaunchKernel(graph);
  }
  (void)gettimeofday(&end_time, nullptr);
  const uint64_t kUSecondInSecond = 1000000;
  uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
  cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
  MS_LOG(DEBUG) << "GPU kernel runtime run graph in " << cost << " us";
  return ret;
}

bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph, Debugger *debugger) {
  bool ret = true;
  auto graph_id = graph->graph_id();
  if (!is_first_step_map_[graph_id]) {
    // Normally run graph
    ret = LaunchKernelDynamic(graph, debugger);
  } else {
    // Mock run first step
    ret = LaunchKernelDynamic(graph, debugger, true, false);
    if (ret) {
      // Normally run graph
      ret = LaunchKernelDynamic(graph, debugger);
    } else {
      // Trigger memory swap
      ret = SearchMemSwapScheme(graph, debugger);
    }
    is_first_step_map_[graph_id] = false;
  }
  return ret;
}

bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
  MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
  bool ret = false;
  ClearKernelOldOutputAndWorkspace(graph);
  if (!mem_swap_manager_->mem_swap_init()) {
    if (!mem_swap_manager_->Init(graph)) {
      return false;
    }
  }
  while (!ret) {
    if (!mem_swap_manager_->RetreatSwapInfo()) {
      return false;
    }
    ret = LaunchKernelDynamic(graph, debugger, true, false);
    if (!ret) {
      ClearKernelOldOutputAndWorkspace(graph);
    }
  }
  mem_swap_manager_->AssignHostMemory();
  // Time profiling
  ret = LaunchKernelDynamic(graph, debugger, false, true);
  if (!ret) {
    return ret;
  }
  return RefineMemSwapScheme(graph, debugger);
}

bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
  MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment.";
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    if (!mem_swap_manager_->QueryKernelTriggerSwapIn(kernel)) {
      continue;
    }
    size_t swap_in_task_num = mem_swap_manager_->QueryKernelTriggerSwapInTaskNum(kernel);
    for (size_t swap_in_task_idx = 0; swap_in_task_idx < swap_in_task_num; swap_in_task_idx++) {
      bool ret = false;
      while (!ret) {
        mem_swap_manager_->AdjustSwapInPos(kernel, swap_in_task_idx);
        ret = LaunchKernelDynamic(graph, debugger, true, false);
        if (!ret) {
          ClearKernelOldOutputAndWorkspace(graph);
          ClearSwapInfo(true);
        }
      }
    }
  }
  return true;
}

void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  // Init the kernel reference count.
  if (!mem_reuse_util_ptr->InitDynamicKernelRef(graph)) {
    MS_LOG(EXCEPTION) << "Init kernel reference count failed";
  }
  mem_reuse_util_ptr->SetKernelDefMap();
  mem_reuse_util_ptr->SetReuseRefCount();
  // Can't free the device address of graph output, so set the reference count of graph output specially.
  mem_reuse_util_ptr->SetGraphOutputRefCount();
  // Can't free the device address of summary nodes, so set the reference count of summary nodes specially.
  mem_reuse_util_ptr->SetSummaryNodesRefCount();
  auto graph_id = graph->graph_id();
  mem_reuse_util_map_[graph_id] = mem_reuse_util_ptr;
}

void GPUKernelRuntime::InitMemorySwapInfo(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  GPUMemCopyManagerPtr gpu_mem_copy_manager = std::make_shared<GPUMemCopyManager>();
  MS_EXCEPTION_IF_NULL(gpu_mem_copy_manager);
  MemSwapManagerPtr mem_swap_manager = std::make_shared<MemSwapManager>(gpu_mem_copy_manager);
  MS_EXCEPTION_IF_NULL(mem_swap_manager);
  auto graph_id = graph->graph_id();
  mem_swap_map_[graph_id] = mem_swap_manager;
  is_first_step_map_[graph_id] = true;
}

void GPUKernelRuntime::InitKernelOutputAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto output_sizes = kernel_mod->GetOutputSizeList();
    for (size_t i = 0; i < output_sizes.size(); ++i) {
      if (AnfAlgo::OutputAddrExist(kernel, i)) {
        continue;
      }
      std::string output_format = AnfAlgo::GetOutputFormat(kernel, i);
      auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
      auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
      AnfAlgo::SetOutputAddr(device_address, i, kernel.get());
    }
  }
}

void GPUKernelRuntime::InitKernelWorkspaceAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto workspace_sizes = kernel_mod->GetWorkspaceSizeList();
    for (size_t i = 0; i < workspace_sizes.size(); ++i) {
      auto device_address = CreateDeviceAddress(nullptr, workspace_sizes[i], "", kTypeUnknown);
      AnfAlgo::SetWorkspaceAddr(device_address, i, kernel.get());
    }
  }
}

void GPUKernelRuntime::SaveGraphOutputNode(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto graph_id = graph->graph_id();
  const auto &output_nodes = AnfAlgo::GetAllOutput(graph->output(), {prim::kPrimTupleGetItem});
  for (const auto &node : output_nodes) {
    graph_output_map_[graph_id].insert(node);
  }
}

bool GPUKernelRuntime::IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto graph_id = graph->graph_id();
  auto iter = graph_output_map_.find(graph_id);
  if (iter == graph_output_map_.end()) {
    MS_LOG(EXCEPTION) << "Find graph output info failed.";
  }
  auto &graph_output_set = iter->second;
  return (graph_output_set.find(kernel) != graph_output_set.end());
}

void GPUKernelRuntime::ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph) {
  ClearKernelOutputAddress(graph);
  ClearKernelWorkspaceAddress(graph);
}

void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    if (IsGraphOutput(graph, kernel)) {
      continue;
    }
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto output_sizes = kernel_mod->GetOutputSizeList();
    for (size_t i = 0; i < output_sizes.size(); ++i) {
      if (!AnfAlgo::OutputAddrExist(kernel, i)) {
        continue;
      }
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
      MS_EXCEPTION_IF_NULL(device_address);
      if (device_address->ptr_) {
        mem_manager_->FreeMemFromMemPool(device_address);
      }
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
}

void GPUKernelRuntime::ClearKernelWorkspaceAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto workspace_sizes = kernel_mod->GetWorkspaceSizeList();
    for (size_t i = 0; i < workspace_sizes.size(); ++i) {
      auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
      MS_EXCEPTION_IF_NULL(device_address);
      if (device_address->ptr_) {
        mem_manager_->FreeMemFromMemPool(device_address);
      }
    }
  }
}

bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger, bool mock,
                                           bool profiling) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  // Reset the reference count.
  mem_reuse_util_->ResetDynamicUsedRefCount();
  // The inputs and outputs memory of communication kernel need be continuous, so separate processing.
  AllocCommunicationOpDynamicRes(graph);
#ifdef ENABLE_DEBUGGER
  bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
  if (!mock) {
    UpdateStepNum(debugger, dump_enabled);
  }
#endif
  auto &kernels = graph->execution_order();
  int exec_order = 1;
  auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
  MS_EXCEPTION_IF_NULL(profiler_inst);
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    AddressPtrList kernel_inputs;
    AddressPtrList kernel_workspaces;
    AddressPtrList kernel_outputs;
    auto ret = AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs, mock);
    if (!ret) {
#ifdef ENABLE_DEBUGGER
      if (!mock) {
        // invalidate current data collected by the debugger
        ClearCurrentData(debugger, dump_enabled);
      }
#endif
      return false;
    }
    if (!mock) {
      if (!profiling) {
        if (profiler_inst->GetEnableFlag()) {
          profiler_inst->OpDataProducerBegin(kernel->fullname_with_scope(), stream_);
        }
        CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_),
                                 "Launch kernel failed.");
        if (profiler_inst->GetEnableFlag()) {
          profiler_inst->OpDataProducerEnd();
          if (profiler_inst->GetSyncEnableFlag()) {
            CHECK_OP_RET_WITH_ERROR(SyncStream(), "Profiler SyncStream failed.");
          }
        }
      } else {
        LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs);
      }
#ifdef ENABLE_DEBUGGER
      // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost)
      LoadKernelData(debugger, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_,
                     dump_enabled);
#endif
    }
    exec_order = exec_order + 1;
    FreeKernelDynamicRes(kernel);
    if (!UpdateMemorySwapTask(kernel, mock, profiling)) {
#ifdef ENABLE_DEBUGGER
      if (!mock) {
        // invalidate current data collected by the debugger
        ClearCurrentData(debugger, dump_enabled);
      }
#endif
      return false;
    }
  }
  if (!mock) {
#ifdef ENABLE_DEBUGGER
    // collect weights and bias for dump mode
    LoadParameters(graph, debugger, dump_enabled);
#endif
    CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
  }
  ClearSwapInfo(mock);
  return true;
}

void GPUKernelRuntime::LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
                                                     const AddressPtrList &workspace, const AddressPtrList &outputs) {
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  float cost_time = 0;
  DeviceEvent start = nullptr;
  DeviceEvent end = nullptr;
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&start), "Failed to create event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&end), "Failed to create event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(start, stream_), "Failed to record event to stream.");
  CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(inputs, workspace, outputs, stream_), "Launch kernel failed.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(end, stream_), "Failed to record event to stream.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(start), "Failed to sync event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(end), "Failed to sync event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::ElapsedTime(&cost_time, start, end), "Failed to record elapsed time.");
  mem_swap_manager_->AddKernelExecutionPerform(kernel, cost_time);
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(start), "Failed to destroy event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(end), "Failed to destroy event.");
}

bool GPUKernelRuntime::AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  const MemSwapInfoSet &mem_swap_info_set = mem_swap_manager_->QueryKernelMemSwapInfo(kernel);
  for (auto &mem_swap_info : mem_swap_info_set) {
    auto need_swap_kernel = mem_swap_manager_->QueryKernelByTopoOrder(mem_swap_info.topo_order_);
    MS_EXCEPTION_IF_NULL(need_swap_kernel);
    const HostAddress &host_address =
      mem_swap_manager_->QueryKernelHostAddr(need_swap_kernel, mem_swap_info.output_idx_);
    auto device_address = AnfAlgo::GetMutableOutputAddr(need_swap_kernel, mem_swap_info.output_idx_, false);
    if (mem_swap_info.swap_kind_ == SwapKind::kDeviceToHost) {
      if (mem_swap_manager_->QueryKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_)) {
        mem_swap_manager_->AddMemSwapTask(SwapKind::kDeviceToHost, device_address, host_address, mock);
        mem_swap_manager_->AddKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_, false);
      } else {
        mem_manager_->FreeMemFromMemPool(device_address);
        device_address->set_status(DeviceAddressStatus::kInHost);
      }
    } else if (mem_swap_info.swap_kind_ == SwapKind::kHostToDevice) {
      auto status = device_address->status();
      if (status == DeviceAddressStatus::kInDeviceToHost) {
        device_address->set_status(DeviceAddressStatus::kInDevice);
      } else if (status == DeviceAddressStatus::kInHost) {
        if (!device_address->ptr_ && !AttemptMallocMem(device_address, device_address->size_, mock)) {
          return false;
        }
        float cost_time = 0;
        mem_swap_manager_->AddMemSwapTask(SwapKind::kHostToDevice, device_address, host_address, mock, profiling,
                                          &cost_time);
        if (profiling) {
          mem_swap_manager_->AddKernelSwapPerform(need_swap_kernel, mem_swap_info.output_idx_,
                                                  std::make_pair(0, cost_time));
        }
      }
    }
  }
  return true;
}

bool GPUKernelRuntime::UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return true;
  }
  if (mem_swap_manager_->QueryKernelTriggerSwap(kernel)) {
    if (!mock) {
      CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
    }
    if (!AddMemorySwapTask(kernel, mock, profiling)) {
      return false;
    }
    if (!mock) {
      CHECK_OP_RET_WITH_EXCEPT(mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost), "SyncCopyStream failed.");
    }
  }
  return true;
}

void GPUKernelRuntime::UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) {
    device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
  }
  auto status = device_address->status();
  switch (status) {
    case DeviceAddressStatus::kInDevice:
      break;
    case DeviceAddressStatus::kInDeviceToHost: {
      device_address->set_status(DeviceAddressStatus::kInDevice);
      break;
    }
    case DeviceAddressStatus::kInHostToDevice: {
      while (device_address->status() != DeviceAddressStatus::kInDevice) {
        while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) {
          device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
        }
      }
      break;
    }
    case DeviceAddressStatus::kInHost:
      MS_LOG(WARNING) << "Unexpected device address status: " << status;
      break;
    default:
      MS_LOG(EXCEPTION) << "Invaild device address status: " << status;
  }
}

void GPUKernelRuntime::UpdateHostSwapOutQueue(bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost, mock)) {
    if (device_address_swap_out->status() == DeviceAddressStatus::kInDeviceToHost && device_address_swap_out->ptr_) {
      device_address_swap_out->set_status(DeviceAddressStatus::kInHost);
      mem_manager_->FreeMemFromMemPool(device_address_swap_out);
    }
  }
}

void GPUKernelRuntime::ClearSwapInfo(bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  mem_swap_manager_->ClearSwapQueue(mock);
  mem_swap_manager_->ResetHostAddrIsDirty();
}

bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  auto ret = mem_manager_->MallocMemFromMemPool(device_address, size);
  if (!ret) {
    if (!mem_swap_manager_->trigger_swap()) {
      return false;
    }
    if (!mock) {
      mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost);
    }
    UpdateHostSwapOutQueue(mock);
    ret = mem_manager_->MallocMemFromMemPool(device_address, size);
    if (!ret) {
      return false;
    }
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                             const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
                                             AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs,
                                             bool mock) {
  if (!AllocKernelInputDynamicRes(kernel, kernel_inputs, mock)) {
    return false;
  }
  if (!AllocKernelOutputDynamicRes(kernel_mod, kernel, kernel_outputs, mock)) {
    return false;
  }
  if (!AllocKernelWorkspaceDynamicRes(kernel_mod, kernel, kernel_workspaces, mock)) {
    return false;
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
                                                  bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_inputs);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    DeviceAddressPtr device_address;
    if (mem_reuse_util_->is_all_nop_node()) {
      // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
    } else {
      // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
    }
    MS_EXCEPTION_IF_NULL(device_address);
    UpdateHostSwapInQueue(device_address, mock);
    MS_EXCEPTION_IF_NULL(device_address->ptr_);
    kernel::AddressPtr input = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(input);
    input->addr = device_address->ptr_;
    input->size = device_address->size_;
    kernel_inputs->emplace_back(input);
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                                   const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_outputs,
                                                   bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_outputs);
  UpdateHostSwapOutQueue(mock);
  auto output_sizes = kernel_mod.GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i], mock)) {
      return false;
    }
    kernel::AddressPtr output = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(output);
    output->addr = device_address->ptr_;
    output->size = output_sizes[i];
    kernel_outputs->emplace_back(output);
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                                      const mindspore::AnfNodePtr &kernel,
                                                      AddressPtrList *kernel_workspaces, bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_workspaces);
  auto workspace_sizes = kernel_mod.GetWorkspaceSizeList();
  for (size_t i = 0; i < workspace_sizes.size(); ++i) {
    if (workspace_sizes[i] == 0) {
      kernel_workspaces->emplace_back(nullptr);
      continue;
    }
    auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
    if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, workspace_sizes[i], mock)) {
      return false;
    }
    kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(workspace);
    workspace->addr = device_address->ptr_;
    workspace->size = workspace_sizes[i];
    kernel_workspaces->emplace_back(workspace);
  }
  return true;
}

void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (auto &kernel : kernels) {
    MS_EXCEPTION_IF_NULL(kernel);
    if (AnfAlgo::IsCommunicationOp(kernel)) {
      AllocCommunicationOpInputDynamicRes(kernel);
      AllocCommunicationOpOutputDynamicRes(kernel);
    }
  }
}

void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  bool is_need_alloc_memory = false;
  bool is_need_free_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  DeviceAddressPtrList addr_list;
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  auto intput_sizes = kernel_mod->GetInputSizeList();
  for (size_t i = 0; i < intput_sizes.size(); ++i) {
    DeviceAddressPtr device_address;
    if (mem_reuse_util_->is_all_nop_node()) {
      // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
    } else {
      // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
    }
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      is_need_alloc_memory = true;
    } else {
      is_need_free_memory = true;
    }
    total_size += intput_sizes[i];
    size_list.emplace_back(intput_sizes[i]);
    addr_list.emplace_back(device_address);
  }
  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
}

void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  bool is_need_alloc_memory = false;
  bool is_need_free_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  DeviceAddressPtrList addr_list;
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  auto output_sizes = kernel_mod->GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      is_need_alloc_memory = true;
    } else {
      is_need_free_memory = true;
    }
    total_size += output_sizes[i];
    size_list.emplace_back(output_sizes[i]);
    addr_list.emplace_back(device_address);
  }
  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
}

void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                                  std::vector<size_t> size_list) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  if (!is_need_alloc_memory) {
    return;
  }
  if (is_need_free_memory) {
    for (const auto &iter : addr_list) {
      MS_EXCEPTION_IF_NULL(iter);
      // Free the inputs/outputs of communication kernel which are not released.
      if (iter->ptr_ != nullptr) {
        mem_manager_->FreeMemFromMemPool(iter);
      }
    }
  }
  auto ret = mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
  if (!ret) {
    MS_LOG(EXCEPTION) << "Malloc device memory failed.";
  }
}

void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  auto cnode = kernel->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  if (AnfAlgo::IsCommunicationOp(kernel)) {
    return;
  }
  // Free the input of kernel by reference count.
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_->GetKernelInputRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    kernel_ref_count_ptr->ref_count_dynamic_use_--;
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ < 0) {
      MS_LOG(EXCEPTION) << "Check dynamic reference count failed.";
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      DeviceAddressPtr device_address;
      if (mem_reuse_util_->is_all_nop_node()) {
        // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
        device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
      } else {
        // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
        device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
      }
      mem_manager_->FreeMemFromMemPool(device_address);
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
  // Free the output of kernel, if output has no reference.
  for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_->GetRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
      mem_manager_->FreeMemFromMemPool(device_address);
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
  // Free the workspace of kernel.
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  for (size_t i = 0; i < kernel_mod->GetWorkspaceSizeList().size(); ++i) {
    auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_) {
      mem_manager_->FreeMemFromMemPool(device_address);
    }
  }
}
}  // namespace gpu
}  // namespace device
}  // namespace mindspore
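The dynamic-memory path above is easiest to read as a two-phase protocol: the first step of a graph is mock-executed (allocation only, no kernel launches), and if that fails, SearchMemSwapScheme retreats swap positions until a mock run fits before profiling and refining. A compact sketch of the first-step control flow, with stand-in functions (LaunchDynamic and SearchSwapScheme are placeholders, not MindSpore APIs):

    #include <cstdint>
    #include <iostream>
    #include <map>

    // Stand-ins for LaunchKernelDynamic and SearchMemSwapScheme.
    bool LaunchDynamic(bool mock, bool profiling) { (void)mock; (void)profiling; return true; }
    bool SearchSwapScheme() { return true; }

    // First call does a mock pass (mock = true) to test allocation; on failure it
    // falls back to searching a swap scheme, otherwise it runs the graph for real.
    bool RunOneStep(uint32_t graph_id, std::map<uint32_t, bool> *is_first_step) {
      bool ret = true;
      if (!(*is_first_step)[graph_id]) {
        ret = LaunchDynamic(false, false);  // normal run
      } else {
        ret = LaunchDynamic(true, false);   // mock run
        ret = ret ? LaunchDynamic(false, false) : SearchSwapScheme();
        (*is_first_step)[graph_id] = false;
      }
      return ret;
    }

    int main() {
      std::map<uint32_t, bool> first{{0, true}};
      std::cout << RunOneStep(0, &first) << ' ' << RunOneStep(0, &first) << '\n';
      return 0;
    }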
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "runtime/device/gpu/gpu_kernel_runtime.h"
#include <algorithm>
#include "runtime/device/gpu/gpu_device_address.h"
#include "runtime/device/gpu/cuda_driver.h"
#include "runtime/device/gpu/gpu_buffer_mgr.h"
#include "runtime/device/gpu/gpu_device_manager.h"
#include "runtime/device/gpu/gpu_memory_allocator.h"
#include "runtime/device/gpu/distribution/collective_init.h"
#include "utils/convert_utils.h"
#include "utils/ms_context.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "runtime/device/gpu/gpu_common.h"
#include "utils/ms_utils.h"
#include "runtime/device/gpu/gpu_memory_manager.h"
#include "backend/kernel_compiler/common_utils.h"
#include "runtime/device/gpu/gpu_memory_copy_manager.h"
#include "common/trans.h"
#include "ir/dtype.h"
#include "profiler/device/gpu/gpu_profiling.h"
#include "utils/shape_utils.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debug_services.h"
#endif
namespace mindspore {
namespace device {
namespace gpu {
using mindspore::device::memswap::MemSwapInfoSet;
using mindspore::device::memswap::MemSwapManager;
using mindspore::device::memswap::SwapKind;
static const size_t PARAMETER_OUTPUT_INDEX = 0;

bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }

bool GPUKernelRuntime::Init() {
  if (device_init_ == true) {
    GPUMemoryAllocator::GetInstance().CheckMaxDeviceMemory();
    return true;
  }
  bool ret = false;
#ifdef ENABLE_DUMP_E2E
  ret = SetDumpConf();
  if (!ret) {
    MS_LOG(INFO) << "No dump conf to set!";
  }
#endif
  ret = InitDevice();
  if (!ret) {
    MS_LOG(ERROR) << "InitDevice error.";
    return ret;
  }
  mem_manager_ = std::make_shared<GPUMemoryManager>();
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->MallocDeviceMemory();
  const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  bool collective_inited = CollectiveInitializer::instance().collective_inited();
  if (collective_inited && collective_handle_ != nullptr) {
    auto init_nccl_comm_funcptr =
      reinterpret_cast<InitNCCLComm>(dlsym(const_cast<void *>(collective_handle_), "InitNCCLComm"));
    MS_EXCEPTION_IF_NULL(init_nccl_comm_funcptr);
    (*init_nccl_comm_funcptr)();
  }
  device_init_ = true;
  return ret;
}
#ifdef ENABLE_DUMP_E2E
namespace {
void DumpOutput(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
                Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(dump_conf);
  bool trans_flag = dump_conf->trans_flag();
  const auto &apply_kernels = graph->execution_order();
  for (const auto &node : apply_kernels) {
    MS_EXCEPTION_IF_NULL(node);
    auto node_name = AnfAlgo::GetCNodeName(node);
    std::string kernel_name = node->fullname_with_scope();
    if (!dump_conf->IsKernelNeedDump(kernel_name)) {
      continue;
    }
    const std::string strsrc = "/";
    const std::string strdst = "--";
    std::string::size_type pos = 0;
    std::string::size_type srclen = strsrc.size();
    std::string::size_type dstlen = strdst.size();
    while ((pos = kernel_name.find(strsrc, pos)) != std::string::npos) {
      kernel_name.replace(pos, srclen, strdst);
      pos += dstlen;
    }
    auto output_size = AnfAlgo::GetOutputTensorNum(node);
    for (size_t j = 0; j < output_size; ++j) {
      auto addr = AnfAlgo::GetOutputAddr(node, j);
      TypeId addr_type_id = addr->type_id();
      std::string addr_format = addr->format();
      ShapeVector int_shapes;
      if (trans_flag) {
        int_shapes = trans::GetRuntimePaddingShape(node, j);
      } else {
        auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
        (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                             [](size_t inner_item) { return SizeToInt(inner_item); });
      }
      auto type = AnfAlgo::GetOutputInferDataType(node, j);
      auto format = kOpFormat_DEFAULT;
      string filepath = dump_path + '/' + kernel_name + '_' + "output_" + std::to_string(j);
      DebugServices *debug_services = debugger->debug_services();
      TensorLoader *tensor_loader = debug_services->tensor_loader();
      std::string original_kernel_name = node->fullname_with_scope();
      size_t slot = j;
      auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type,
                                                 addr_type_id, addr_format, slot);
      if (!ret) {
        std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath +
                            ", host_format:" + format + ".!";
      }
    }
  }
}

void DumpParameters(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
                    Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(dump_conf);
  bool trans_flag = dump_conf->trans_flag();
  const auto &parameters = graph->inputs();
  for (auto &item : parameters) {
    if (!item->isa<Parameter>()) {
      continue;
    }
    std::string parameter_name = item->fullname_with_scope();
    if (!dump_conf->IsKernelNeedDump(parameter_name)) {
      continue;
    }
    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
    TypeId addr_type_id = addr->type_id();
    std::string addr_format = addr->format();
    ShapeVector int_shapes;
    if (trans_flag) {
      int_shapes = trans::GetRuntimePaddingShape(item, PARAMETER_OUTPUT_INDEX);
    } else {
      auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                           [](size_t inner_item) { return SizeToInt(inner_item); });
    }
    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    string filepath = dump_path + '/' + parameter_name + '_' + "output_0";
    DebugServices *debug_services = debugger->debug_services();
    TensorLoader *tensor_loader = debug_services->tensor_loader();
    std::string original_kernel_name = parameter_name;
    size_t slot = 0;
    auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type,
                                               addr_type_id, addr_format, slot);
    if (!ret) {
      std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath +
                          ", host_format:" + format + ".!";
    }
  }
}
}  // namespace

bool GPUKernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_LOG(INFO) << "Start dump step";
  DumpConfPtr dump_conf = GetDumpConf();
  MS_EXCEPTION_IF_NULL(dump_conf);
  dump_conf->UpdataCurIter();
  bool dump_flag = dump_conf->dump_enable();
  if (!dump_flag) {
    MS_LOG(INFO) << "Dump flag is disable, pass dump step";
    return true;
  }
  uint32_t cur_iter = dump_conf->cur_iter();
  if (dump_conf->dump_iter() != 0) {
    if (cur_iter != dump_conf->dump_iter()) {
      return true;
    }
  }
  MS_LOG(INFO) << "Cur iter is " << cur_iter;
  std::string net_name = dump_conf->dump_net_name();
  std::string iterator = std::to_string(cur_iter);
  std::string dump_path = dump_conf->dump_path();
  if (dump_path.back() == '/') {
    dump_path = dump_path + net_name + '/' + iterator;
  } else {
    dump_path = dump_path + '/' + net_name + '/' + iterator;
  }
  // dump output
  DumpOutput(graph, dump_path, dump_conf, debugger);
  // dump parameters
  DumpParameters(graph, dump_path, dump_conf, debugger);
  return true;
}
#endif
#ifdef ENABLE_DEBUGGER
namespace {
void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_inputs,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_workspaces,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_outputs, int exec_order, void *stream_ptr,
                    bool dump_enabled) {
  // check if we should read the kernel data
  bool read_data = false;
  std::string kernel_name = kernel->fullname_with_scope();
  if (debugger) {
    debugger->SetCurNode(kernel_name);
    if (dump_enabled) {
      read_data = true;
    } else if (debugger->debugger_enabled()) {
      read_data = debugger->ReadNodeDataRequired();
    }
  }
  if (!read_data) {
    return;
  }
  // get inputs
  auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  for (size_t j = 0; j < input_size; ++j) {
    auto input_kernel = kernel->input(j + 1);
    std::string input_kernel_name = input_kernel->fullname_with_scope();
    auto addr = kernel_inputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string input_tensor_name = input_kernel_name + ':' + "0";
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
    }
  }
  // get outputs
  auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  for (size_t j = 0; j < output_size; ++j) {
    auto addr = kernel_outputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(kernel, j);
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string tensor_name = kernel_name + ':' + std::to_string(j);
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(kernel, j);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
    }
  }
  debugger->PostExecuteNode();
}

void UpdateStepNum(Debugger *debugger, bool dump_enabled) {
  if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
    auto cur_step_num = debugger->step_num();
    cur_step_num = cur_step_num + 1;
    debugger->SetStepNum(cur_step_num);
  }
}

void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) {
  MS_EXCEPTION_IF_NULL(graph);
  if (!(debugger && dump_enabled)) {
    return;
  }
  const auto &parameters = graph->inputs();
  // for parameters, set its execution order to be 0;
  int exec_order = 0;
  for (auto &item : parameters) {
    if (!item->isa<Parameter>()) {
      continue;
    }
    std::string parameter_name = item->fullname_with_scope();
    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    string tensor_name = parameter_name + ':' + "0";
    auto gpu_addr = dynamic_cast<const mindspore::device::gpu::GPUDeviceAddress *>(addr);
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
    }
  }
}

void ClearCurrentData(Debugger *debugger, bool dump_enabled) {
  if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
    DebugServices *debug_services = debugger->debug_services();
    TensorLoader *tensor_loader = debug_services->tensor_loader();
    tensor_loader->EmptyCurrentTensor();
  }
}
}  // namespace
#endif
DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                                       TypeId type_id) {
  return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
}

bool GPUKernelRuntime::InitDevice() {
  if (GPUDeviceManager::GetInstance().device_count() <= 0) {
    MS_LOG(ERROR) << "No GPU device found.";
    return false;
  }
  const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  bool collective_inited = CollectiveInitializer::instance().collective_inited();
  if (collective_inited && collective_handle_ != nullptr) {
    auto get_local_rank_funcptr =
      reinterpret_cast<GetLocalRankId>(dlsym(const_cast<void *>(collective_handle_), "local_rank_id"));
    MS_EXCEPTION_IF_NULL(get_local_rank_funcptr);
    device_id_ = IntToUint((*get_local_rank_funcptr)());
  }
  if (!GPUDeviceManager::GetInstance().is_device_id_init()) {
    if (!GPUDeviceManager::GetInstance().set_cur_device_id(device_id_)) {
      MS_LOG(ERROR) << "Failed to set current device to " << SizeToInt(device_id_);
      return false;
    }
  }
  GPUDeviceManager::GetInstance().InitDevice();
  stream_ = GPUDeviceManager::GetInstance().default_stream();
  if (stream_ == nullptr) {
    MS_LOG(ERROR) << "No default CUDA stream found.";
    return false;
  }
  return true;
}

void GPUKernelRuntime::ReleaseDeviceRes() {
  // For dataset mode.
  if (GpuBufferMgr::GetInstance().IsInit()) {
    if (!GpuBufferMgr::GetInstance().IsClosed()) {
      if (!GpuBufferMgr::GetInstance().CloseNotify()) {
        MS_LOG(EXCEPTION) << "Could not close gpu data queue.";
      }
    }
    CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue.");
  }
  // Destroy remaining memory swap events and free host memory.
  for (auto &item : mem_swap_map_) {
    auto &mem_swap_manager = item.second;
    MS_EXCEPTION_IF_NULL(mem_swap_manager);
    if (mem_swap_manager->trigger_swap()) {
      mem_swap_manager->ClearSwapQueue(false);
      mem_swap_manager->ReleaseHostPinnedMem();
    }
  }
  GPUDeviceManager::GetInstance().ReleaseDevice();
  if (mem_manager_ != nullptr) {
    mem_manager_->FreeDeviceMemory();
  }
  kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
  MS_EXCEPTION_IF_NULL(bin_map);
  bin_map->RemoveKernelCache();
}

void GPUKernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                                 const std::unordered_set<ValueNodePtr> &value_nodes,
                                                 const std::vector<CNodePtr> &execution_order) {
  MS_LOG(INFO) << "Clear graph:" << graph_id << " GPU runtime resource";
  // Release the kernel resource.
  for (const auto &kernel : execution_order) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    if (kernel_mod == nullptr) {
      continue;
    }
    kernel_mod->ReleaseResource();
  }
  // Clear the output address of graph.
  ClearOutputAddress(inputs, value_nodes, execution_order);
}

void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->ResetDynamicMemory();
  AssignStaticMemoryInput(graph);
  AssignStaticMemoryValueNode(graph);
  bool is_enable_dynamic_mem = context_ptr->get_param<bool>(MS_CTX_ENABLE_DYNAMIC_MEM_POOL);
  if (is_enable_dynamic_mem) {
    // Use the dynamic memory pool.
    InitKernelRefCount(graph);
    InitMemorySwapInfo(graph);
    InitKernelOutputAddress(graph);
    InitKernelWorkspaceAddress(graph);
    SaveGraphOutputNode(graph);
  } else {
    AssignDynamicMemory(graph);
  }
}

bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger) {
  struct timeval start_time, end_time;
  (void)gettimeofday(&start_time, nullptr);
  bool ret = true;
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  bool is_enable_dynamic_mem = context_ptr->get_param<bool>(MS_CTX_ENABLE_DYNAMIC_MEM_POOL);
  bool is_enable_pynative_infer = context_ptr->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER);
  if (is_enable_dynamic_mem && !is_enable_pynative_infer) {
    auto graph_id = graph->graph_id();
    auto iter = mem_swap_map_.find(graph_id);
    if (iter == mem_swap_map_.end()) {
      MS_LOG(EXCEPTION) << "Find memory swap map failed.";
    }
    mem_swap_manager_ = iter->second;
    MS_EXCEPTION_IF_NULL(mem_swap_manager_);
    auto mem_reuse_iter = mem_reuse_util_map_.find(graph_id);
    if (mem_reuse_iter == mem_reuse_util_map_.end()) {
      MS_LOG(EXCEPTION) << "Find memory reuse map failed.";
    }
    mem_reuse_util_ = mem_reuse_iter->second;
    MS_EXCEPTION_IF_NULL(mem_reuse_util_);
    ret = RunOneStep(graph, debugger);
  } else {
    ret = LaunchKernel(graph);
  }
  (void)gettimeofday(&end_time, nullptr);
  const uint64_t kUSecondInSecond = 1000000;
  uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
  cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
  MS_LOG(DEBUG) << "GPU kernel runtime run graph in " << cost << " us";
  return ret;
}
bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph, Debugger *debugger) {
  bool ret = true;
  auto graph_id = graph->graph_id();
  if (!is_first_step_map_[graph_id]) {
    // Normally run graph
    ret = LaunchKernelDynamic(graph, debugger);
  } else {
    // Mock run first step
    ret = LaunchKernelDynamic(graph, debugger, true, false);
    if (ret) {
      // Normally run graph
      ret = LaunchKernelDynamic(graph, debugger);
    } else {
      // Trigger memory swap
      ret = SearchMemSwapScheme(graph, debugger);
    }
    is_first_step_map_[graph_id] = false;
  }
  return ret;
}
bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
  MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
  bool ret = false;
  ClearKernelOldOutputAndWorkspace(graph);
  if (!mem_swap_manager_->mem_swap_init()) {
    if (!mem_swap_manager_->Init(graph)) {
      return false;
    }
  }
  while (!ret) {
    if (!mem_swap_manager_->RetreatSwapInfo()) {
      return false;
    }
    ret = LaunchKernelDynamic(graph, debugger, true, false);
    if (!ret) {
      ClearKernelOldOutputAndWorkspace(graph);
    }
  }
  mem_swap_manager_->AssignHostMemory();
  // Time profiling
  ret = LaunchKernelDynamic(graph, debugger, false, true);
  if (!ret) {
    return ret;
  }
  return RefineMemSwapScheme(graph, debugger);
}

bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
  MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment.";
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    if (!mem_swap_manager_->QueryKernelTriggerSwapIn(kernel)) {
      continue;
    }
    size_t swap_in_task_num = mem_swap_manager_->QueryKernelTriggerSwapInTaskNum(kernel);
    for (size_t swap_in_task_idx = 0; swap_in_task_idx < swap_in_task_num; swap_in_task_idx++) {
      bool ret = false;
      while (!ret) {
        mem_swap_manager_->AdjustSwapInPos(kernel, swap_in_task_idx);
        ret = LaunchKernelDynamic(graph, debugger, true, false);
        if (!ret) {
          ClearKernelOldOutputAndWorkspace(graph);
          ClearSwapInfo(true);
        }
      }
    }
  }
  return true;
}
void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  // Init the kernel reference count.
  if (!mem_reuse_util_ptr->InitDynamicKernelRef(graph)) {
    MS_LOG(EXCEPTION) << "Init kernel reference count failed";
  }
  mem_reuse_util_ptr->SetKernelDefMap();
  mem_reuse_util_ptr->SetReuseRefCount();
  // Can't free the device address of graph output, so set the reference count of graph output specially.
  mem_reuse_util_ptr->SetGraphOutputRefCount();
  // Can't free the device address of summary nodes, so set the reference count of summary nodes specially.
  mem_reuse_util_ptr->SetSummaryNodesRefCount();
  auto graph_id = graph->graph_id();
  mem_reuse_util_map_[graph_id] = mem_reuse_util_ptr;
}

void GPUKernelRuntime::InitMemorySwapInfo(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  GPUMemCopyManagerPtr gpu_mem_copy_manager = std::make_shared<GPUMemCopyManager>();
  MS_EXCEPTION_IF_NULL(gpu_mem_copy_manager);
  MemSwapManagerPtr mem_swap_manager = std::make_shared<MemSwapManager>(gpu_mem_copy_manager);
  MS_EXCEPTION_IF_NULL(mem_swap_manager);
  auto graph_id = graph->graph_id();
  mem_swap_map_[graph_id] = mem_swap_manager;
  is_first_step_map_[graph_id] = true;
}

void GPUKernelRuntime::InitKernelOutputAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto output_sizes = kernel_mod->GetOutputSizeList();
    for (size_t i = 0; i < output_sizes.size(); ++i) {
      if (AnfAlgo::OutputAddrExist(kernel, i)) {
        continue;
      }
      std::string output_format = AnfAlgo::GetOutputFormat(kernel, i);
      auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
      auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
      AnfAlgo::SetOutputAddr(device_address, i, kernel.get());
    }
  }
}

void GPUKernelRuntime::InitKernelWorkspaceAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto workspace_sizes = kernel_mod->GetWorkspaceSizeList();
    for (size_t i = 0; i < workspace_sizes.size(); ++i) {
      auto device_address = CreateDeviceAddress(nullptr, workspace_sizes[i], "", kTypeUnknown);
      AnfAlgo::SetWorkspaceAddr(device_address, i, kernel.get());
    }
  }
}

void GPUKernelRuntime::SaveGraphOutputNode(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto graph_id = graph->graph_id();
  const auto &output_nodes = AnfAlgo::GetAllOutput(graph->output(), {prim::kPrimTupleGetItem});
  for (const auto &node : output_nodes) {
    graph_output_map_[graph_id].insert(node);
  }
}

bool GPUKernelRuntime::IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto graph_id = graph->graph_id();
  auto iter = graph_output_map_.find(graph_id);
  if (iter == graph_output_map_.end()) {
    MS_LOG(EXCEPTION) << "Find graph output info failed.";
  }
  auto &graph_output_set = iter->second;
  return (graph_output_set.find(kernel) != graph_output_set.end());
}

void GPUKernelRuntime::ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph) {
  ClearKernelOutputAddress(graph);
  ClearKernelWorkspaceAddress(graph);
}

void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    if (IsGraphOutput(graph, kernel)) {
      continue;
    }
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto output_sizes = kernel_mod->GetOutputSizeList();
    for (size_t i = 0; i < output_sizes.size(); ++i) {
      if (!AnfAlgo::OutputAddrExist(kernel, i)) {
        continue;
      }
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
      MS_EXCEPTION_IF_NULL(device_address);
      if (device_address->ptr_) {
        mem_manager_->FreeMemFromMemPool(device_address);
      }
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
}

void GPUKernelRuntime::ClearKernelWorkspaceAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto workspace_sizes = kernel_mod->GetWorkspaceSizeList();
    for (size_t i = 0; i < workspace_sizes.size(); ++i) {
      auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
      MS_EXCEPTION_IF_NULL(device_address);
      if (device_address->ptr_) {
        mem_manager_->FreeMemFromMemPool(device_address);
      }
    }
  }
}
bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger, bool mock,
                                           bool profiling) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  // Reset the reference count.
  mem_reuse_util_->ResetDynamicUsedRefCount();
  // The inputs and outputs memory of communication kernel need be continuous, so separate processing.
  AllocCommunicationOpDynamicRes(graph);
#ifdef ENABLE_DEBUGGER
  bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
  if (!mock) {
    UpdateStepNum(debugger, dump_enabled);
  }
#endif
  auto &kernels = graph->execution_order();
  int exec_order = 1;
  auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
  MS_EXCEPTION_IF_NULL(profiler_inst);
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    AddressPtrList kernel_inputs;
    AddressPtrList kernel_workspaces;
    AddressPtrList kernel_outputs;
    auto ret = AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs, mock);
    if (!ret) {
#ifdef ENABLE_DEBUGGER
      if (!mock) {
        // invalidate current data collected by the debugger
        ClearCurrentData(debugger, dump_enabled);
      }
#endif
      return false;
    }
    if (!mock) {
      if (!profiling) {
        if (profiler_inst->GetEnableFlag()) {
          profiler_inst->OpDataProducerBegin(kernel->fullname_with_scope(), stream_);
        }
        CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_),
                                 "Launch kernel failed.");
        if (profiler_inst->GetEnableFlag()) {
          profiler_inst->OpDataProducerEnd();
          if (profiler_inst->GetSyncEnableFlag()) {
            CHECK_OP_RET_WITH_ERROR(SyncStream(), "Profiler SyncStream failed.");
          }
        }
      } else {
        LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs);
      }
#ifdef ENABLE_DEBUGGER
      // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost)
      LoadKernelData(debugger, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_,
                     dump_enabled);
#endif
    }
    exec_order = exec_order + 1;
    FreeKernelDynamicRes(kernel);
    if (!UpdateMemorySwapTask(kernel, mock, profiling)) {
#ifdef ENABLE_DEBUGGER
      if (!mock) {
        // invalidate current data collected by the debugger
        ClearCurrentData(debugger, dump_enabled);
      }
#endif
      return false;
    }
  }
  if (!mock) {
#ifdef ENABLE_DEBUGGER
    // collect weights and bias for dump mode
    LoadParameters(graph, debugger, dump_enabled);
#endif
    CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
  }
  ClearSwapInfo(mock);
  return true;
}
void GPUKernelRuntime::LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
                                                     const AddressPtrList &workspace, const AddressPtrList &outputs) {
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  float cost_time = 0;
  DeviceEvent start = nullptr;
  DeviceEvent end = nullptr;
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&start), "Failed to create event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&end), "Failed to create event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(start, stream_), "Failed to record event to stream.");
  CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(inputs, workspace, outputs, stream_), "Launch kernel failed.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(end, stream_), "Failed to record event to stream.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(start), "Failed to sync event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(end), "Failed to sync event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::ElapsedTime(&cost_time, start, end), "Failed to record elapsed time.");
  mem_swap_manager_->AddKernelExecutionPerform(kernel, cost_time);
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(start), "Failed to destroy event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(end), "Failed to destroy event.");
}
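// For reference, the same event-based timing pattern expressed directly against the
// CUDA runtime API. This is an illustrative sketch, not code from this file: the
// function above goes through MindSpore's CudaDriver wrapper instead, and the helper
// name TimeKernelMs and its launch-callback parameter are made up here.
//
// #include <cuda_runtime.h>
//
// float TimeKernelMs(void (*launch)(cudaStream_t), cudaStream_t stream) {
//   cudaEvent_t start = nullptr;
//   cudaEvent_t stop = nullptr;
//   cudaEventCreate(&start);
//   cudaEventCreate(&stop);
//   cudaEventRecord(start, stream);  // mark the point just before the work
//   launch(stream);                  // enqueue the kernel being measured
//   cudaEventRecord(stop, stream);   // mark the point just after the work
//   cudaEventSynchronize(stop);      // wait until the stop event has completed
//   float ms = 0.0f;
//   cudaEventElapsedTime(&ms, start, stop);  // GPU-side elapsed time in milliseconds
//   cudaEventDestroy(start);
//   cudaEventDestroy(stop);
//   return ms;
// }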
bool GPUKernelRuntime::AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  const MemSwapInfoSet &mem_swap_info_set = mem_swap_manager_->QueryKernelMemSwapInfo(kernel);
  for (auto &mem_swap_info : mem_swap_info_set) {
    auto need_swap_kernel = mem_swap_manager_->QueryKernelByTopoOrder(mem_swap_info.topo_order_);
    MS_EXCEPTION_IF_NULL(need_swap_kernel);
    const HostAddress &host_address =
      mem_swap_manager_->QueryKernelHostAddr(need_swap_kernel, mem_swap_info.output_idx_);
    auto device_address = AnfAlgo::GetMutableOutputAddr(need_swap_kernel, mem_swap_info.output_idx_, false);
    if (mem_swap_info.swap_kind_ == SwapKind::kDeviceToHost) {
      if (mem_swap_manager_->QueryKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_)) {
        mem_swap_manager_->AddMemSwapTask(SwapKind::kDeviceToHost, device_address, host_address, mock);
        mem_swap_manager_->AddKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_, false);
      } else {
        mem_manager_->FreeMemFromMemPool(device_address);
        device_address->set_status(DeviceAddressStatus::kInHost);
      }
    } else if (mem_swap_info.swap_kind_ == SwapKind::kHostToDevice) {
      auto status = device_address->status();
      if (status == DeviceAddressStatus::kInDeviceToHost) {
        device_address->set_status(DeviceAddressStatus::kInDevice);
      } else if (status == DeviceAddressStatus::kInHost) {
        if (!device_address->ptr_ && !AttemptMallocMem(device_address, device_address->size_, mock)) {
          return false;
        }
        float cost_time = 0;
        mem_swap_manager_->AddMemSwapTask(SwapKind::kHostToDevice, device_address, host_address, mock, profiling,
                                          &cost_time);
        if (profiling) {
          mem_swap_manager_->AddKernelSwapPerform(need_swap_kernel, mem_swap_info.output_idx_,
                                                  std::make_pair(0, cost_time));
        }
      }
    }
  }
  return true;
}

bool GPUKernelRuntime::UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return true;
  }
  if (mem_swap_manager_->QueryKernelTriggerSwap(kernel)) {
    if (!mock) {
      CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
    }
    if (!AddMemorySwapTask(kernel, mock, profiling)) {
      return false;
    }
    if (!mock) {
      CHECK_OP_RET_WITH_EXCEPT(mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost), "SyncCopyStream failed.");
    }
  }
  return true;
}
void GPUKernelRuntime::UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) {
    device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
  }
  auto status = device_address->status();
  switch (status) {
    case DeviceAddressStatus::kInDevice:
      break;
    case DeviceAddressStatus::kInDeviceToHost: {
      device_address->set_status(DeviceAddressStatus::kInDevice);
      break;
    }
    case DeviceAddressStatus::kInHostToDevice: {
      while (device_address->status() != DeviceAddressStatus::kInDevice) {
        while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) {
          device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
        }
      }
      break;
    }
    case DeviceAddressStatus::kInHost:
      MS_LOG(WARNING) << "Unexpected device address status: " << status;
      break;
    default:
      MS_LOG(EXCEPTION) << "Invalid device address status: " << status;
  }
}
void GPUKernelRuntime::UpdateHostSwapOutQueue(bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost, mock)) {
    if (device_address_swap_out->status() == DeviceAddressStatus::kInDeviceToHost && device_address_swap_out->ptr_) {
      device_address_swap_out->set_status(DeviceAddressStatus::kInHost);
      mem_manager_->FreeMemFromMemPool(device_address_swap_out);
    }
  }
}

void GPUKernelRuntime::ClearSwapInfo(bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  mem_swap_manager_->ClearSwapQueue(mock);
  mem_swap_manager_->ResetHostAddrIsDirty();
}

bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  auto ret = mem_manager_->MallocMemFromMemPool(device_address, size);
  if (!ret) {
    if (!mem_swap_manager_->trigger_swap()) {
      return false;
    }
    if (!mock) {
      mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost);
    }
    UpdateHostSwapOutQueue(mock);
    ret = mem_manager_->MallocMemFromMemPool(device_address, size);
    if (!ret) {
      return false;
    }
  }
  return true;
}
bool GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                             const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
                                             AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs,
                                             bool mock) {
  if (!AllocKernelInputDynamicRes(kernel, kernel_inputs, mock)) {
    return false;
  }
  if (!AllocKernelOutputDynamicRes(kernel_mod, kernel, kernel_outputs, mock)) {
    return false;
  }
  if (!AllocKernelWorkspaceDynamicRes(kernel_mod, kernel, kernel_workspaces, mock)) {
    return false;
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
                                                  bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_inputs);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    DeviceAddressPtr device_address;
    if (mem_reuse_util_->is_all_nop_node()) {
      // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
    } else {
      // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
    }
    MS_EXCEPTION_IF_NULL(device_address);
    UpdateHostSwapInQueue(device_address, mock);
    MS_EXCEPTION_IF_NULL(device_address->ptr_);
    kernel::AddressPtr input = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(input);
    input->addr = device_address->ptr_;
    input->size = device_address->size_;
    kernel_inputs->emplace_back(input);
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                                   const mindspore::AnfNodePtr &kernel,
                                                   AddressPtrList *kernel_outputs, bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_outputs);
  UpdateHostSwapOutQueue(mock);
  auto output_sizes = kernel_mod.GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i], mock)) {
      return false;
    }
    kernel::AddressPtr output = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(output);
    output->addr = device_address->ptr_;
    output->size = output_sizes[i];
    kernel_outputs->emplace_back(output);
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                                      const mindspore::AnfNodePtr &kernel,
                                                      AddressPtrList *kernel_workspaces, bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_workspaces);
  auto workspace_sizes = kernel_mod.GetWorkspaceSizeList();
  for (size_t i = 0; i < workspace_sizes.size(); ++i) {
    if (workspace_sizes[i] == 0) {
      kernel_workspaces->emplace_back(nullptr);
      continue;
    }
    auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
    if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, workspace_sizes[i], mock)) {
      return false;
    }
    kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(workspace);
    workspace->addr = device_address->ptr_;
    workspace->size = workspace_sizes[i];
    kernel_workspaces->emplace_back(workspace);
  }
  return true;
}
void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (auto &kernel : kernels) {
    MS_EXCEPTION_IF_NULL(kernel);
    if (AnfAlgo::IsCommunicationOp(kernel)) {
      AllocCommunicationOpInputDynamicRes(kernel);
      AllocCommunicationOpOutputDynamicRes(kernel);
    }
  }
}
void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  bool is_need_alloc_memory = false;
  bool is_need_free_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  DeviceAddressPtrList addr_list;
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  auto input_sizes = kernel_mod->GetInputSizeList();
  for (size_t i = 0; i < input_sizes.size(); ++i) {
    DeviceAddressPtr device_address;
    if (mem_reuse_util_->is_all_nop_node()) {
      // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
    } else {
      // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
    }
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      is_need_alloc_memory = true;
    } else {
      is_need_free_memory = true;
    }
    total_size += input_sizes[i];
    size_list.emplace_back(input_sizes[i]);
    addr_list.emplace_back(device_address);
  }
  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
}
void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  bool is_need_alloc_memory = false;
  bool is_need_free_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  DeviceAddressPtrList addr_list;
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  auto output_sizes = kernel_mod->GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      is_need_alloc_memory = true;
    } else {
      is_need_free_memory = true;
    }
    total_size += output_sizes[i];
    size_list.emplace_back(output_sizes[i]);
    addr_list.emplace_back(device_address);
  }
  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
}

void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                                  std::vector<size_t> size_list) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  if (!is_need_alloc_memory) {
    return;
  }
  if (is_need_free_memory) {
    for (const auto &iter : addr_list) {
      MS_EXCEPTION_IF_NULL(iter);
      // Free the inputs/outputs of communication kernel which are not released.
      if (iter->ptr_ != nullptr) {
        mem_manager_->FreeMemFromMemPool(iter);
      }
    }
  }
  auto ret = mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
  if (!ret) {
    MS_LOG(EXCEPTION) << "Malloc device memory failed.";
  }
}
void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  auto cnode = kernel->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  if (AnfAlgo::IsCommunicationOp(kernel)) {
    return;
  }
  // Free the input of kernel by reference count.
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_->GetKernelInputRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    kernel_ref_count_ptr->ref_count_dynamic_use_--;
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ < 0) {
      MS_LOG(EXCEPTION) << "Check dynamic reference count failed.";
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      DeviceAddressPtr device_address;
      if (mem_reuse_util_->is_all_nop_node()) {
        // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
        device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
      } else {
        // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
        device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
      }
      mem_manager_->FreeMemFromMemPool(device_address);
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
  // Free the output of kernel, if output has no reference.
  for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_->GetRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
      mem_manager_->FreeMemFromMemPool(device_address);
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
  // Free the workspace of kernel.
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  for (size_t i = 0; i < kernel_mod->GetWorkspaceSizeList().size(); ++i) {
    auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_) {
      mem_manager_->FreeMemFromMemPool(device_address);
    }
  }
}
}  // namespace gpu
}  // namespace device
}  // namespace mindspore
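Taken together, the flow in this file is easier to see from the caller's side. The following is a minimal sketch, not code from this commit: it assumes an already-compiled session::KernelGraph, and it fetches the registered GPU runtime through KernelRuntimeManager (the GetKernelRuntime accessor and its exact signature are assumed here); error handling is reduced to booleans.

#include "runtime/device/kernel_runtime_manager.h"

// Sketch: drive GPUKernelRuntime through the unified KernelRuntime interface.
bool RunGpuGraphOnce(mindspore::session::KernelGraph *graph, uint32_t device_id) {
  auto runtime =
    mindspore::device::KernelRuntimeManager::Instance().GetKernelRuntime(mindspore::kGPUDevice, device_id);
  if (runtime == nullptr || !runtime->Init()) {  // Init mallocs the device pool and inits NCCL when available
    return false;
  }
  runtime->AssignMemory(graph);  // static memory first, then dynamic-pool bookkeeping per graph
  // The GPU runtime does not use task sink, so is_task_sink is passed as false.
  return runtime->Run(graph, false);
}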
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h (view file @ c0070d3d)
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
#include <string>
#include <memory>
#include <vector>
#include <set>
#include <utility>
#include <unordered_map>
#include <unordered_set>
#include "runtime/device/kernel_runtime.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "backend/optimizer/mem_reuse/mem_swap_manager.h"
namespace mindspore {
namespace device {
namespace gpu {
using mindspore::device::memswap::MemSwapManagerPtr;
class GPUKernelRuntime : public KernelRuntime {
 public:
  GPUKernelRuntime() = default;
  ~GPUKernelRuntime() override = default;
  bool Init() override;
  void ReleaseDeviceRes() override;
  void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                 const std::unordered_set<ValueNodePtr> &value_nodes,
                                 const std::vector<CNodePtr> &execution_order) override;
  void AssignMemory(session::KernelGraph *graph) override;
  bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
#ifdef ENABLE_DUMP_E2E
  bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
#endif

 protected:
  DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                       TypeId type_id) override;
  bool SyncStream() override;

 private:
  GPUKernelRuntime(const GPUKernelRuntime &);
  GPUKernelRuntime &operator=(const GPUKernelRuntime &);
  bool InitDevice();
  bool device_init_{false};
  // The related functions and members for using dynamic memory pool.
  void InitKernelRefCount(const session::KernelGraph *graph);
  void InitKernelOutputAddress(const session::KernelGraph *graph);
  void InitKernelWorkspaceAddress(const session::KernelGraph *graph);
  void InitMemorySwapInfo(const session::KernelGraph *graph);
  void SaveGraphOutputNode(const session::KernelGraph *graph);
  bool IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const;
  void ClearKernelOutputAddress(const session::KernelGraph *graph);
  void ClearKernelWorkspaceAddress(const session::KernelGraph *graph);
  void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph);
  bool RunOneStep(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger = nullptr, bool mock = false,
                           bool profiling = false);
  void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
                                     const AddressPtrList &workspace, const AddressPtrList &outputs);
  bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock);
  bool AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
                             AddressPtrList *kernel_inputs, AddressPtrList *kernel_workspaces,
                             AddressPtrList *kernel_outputs, bool mock);
  bool AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, bool mock);
  bool AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
                                   AddressPtrList *kernel_outputs, bool mock);
  bool AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                      const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_workspaces,
                                      bool mock);
  void AllocCommunicationOpDynamicRes(const session::KernelGraph *graph);
  void AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel);
  void AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel);
  void AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                  std::vector<size_t> size_list);
  void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel);
  bool UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
  bool AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
  void UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock);
  void UpdateHostSwapOutQueue(bool mock);
  void ClearSwapInfo(bool mock);
  std::unordered_map<uint32_t, MemReuseUtilPtr> mem_reuse_util_map_;
  std::unordered_map<uint32_t, MemSwapManagerPtr> mem_swap_map_;
  std::unordered_map<uint32_t, bool> is_first_step_map_;
  std::unordered_map<uint32_t, std::set<AnfNodePtr>> graph_output_map_;
  MemReuseUtilPtr mem_reuse_util_{nullptr};
  MemSwapManagerPtr mem_swap_manager_{nullptr};
};
MS_REG_KERNEL_RUNTIME(kGPUDevice, GPUKernelRuntime);
}  // namespace gpu
}  // namespace device
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
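The header above is the pre-commit version; the copy that follows is the updated one. The only interface difference between them is that Run gains the is_task_sink parameter, so an existing call site migrates by stating the mode explicitly. A sketch, with runtime, graph, and debugger assumed from context:

// Before this commit:
//   ok = runtime->Run(graph, debugger);
// After this commit (the GPU runtime does not use task sink, hence false):
ok = runtime->Run(graph, false, debugger);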
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
#include <string>
#include <memory>
#include <vector>
#include <set>
#include <utility>
#include <unordered_map>
#include <unordered_set>
#include "runtime/device/kernel_runtime.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "backend/optimizer/mem_reuse/mem_swap_manager.h"
namespace mindspore {
namespace device {
namespace gpu {
using mindspore::device::memswap::MemSwapManagerPtr;
class GPUKernelRuntime : public KernelRuntime {
 public:
  GPUKernelRuntime() = default;
  ~GPUKernelRuntime() override = default;
  bool Init() override;
  void ReleaseDeviceRes() override;
  void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                 const std::unordered_set<ValueNodePtr> &value_nodes,
                                 const std::vector<CNodePtr> &execution_order) override;
  void AssignMemory(session::KernelGraph *graph) override;
  bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override;
#ifdef ENABLE_DUMP_E2E
  bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
#endif

 protected:
  DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                       TypeId type_id) override;
  bool SyncStream() override;

 private:
  GPUKernelRuntime(const GPUKernelRuntime &);
  GPUKernelRuntime &operator=(const GPUKernelRuntime &);
  bool InitDevice();
  bool device_init_{false};
  // The related functions and members for using dynamic memory pool.
  void InitKernelRefCount(const session::KernelGraph *graph);
  void InitKernelOutputAddress(const session::KernelGraph *graph);
  void InitKernelWorkspaceAddress(const session::KernelGraph *graph);
  void InitMemorySwapInfo(const session::KernelGraph *graph);
  void SaveGraphOutputNode(const session::KernelGraph *graph);
  bool IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const;
  void ClearKernelOutputAddress(const session::KernelGraph *graph);
  void ClearKernelWorkspaceAddress(const session::KernelGraph *graph);
  void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph);
  bool RunOneStep(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger = nullptr, bool mock = false,
                           bool profiling = false);
  void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
                                     const AddressPtrList &workspace, const AddressPtrList &outputs);
  bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock);
  bool AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
                             AddressPtrList *kernel_inputs, AddressPtrList *kernel_workspaces,
                             AddressPtrList *kernel_outputs, bool mock);
  bool AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, bool mock);
  bool AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
                                   AddressPtrList *kernel_outputs, bool mock);
  bool AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                      const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_workspaces,
                                      bool mock);
  void AllocCommunicationOpDynamicRes(const session::KernelGraph *graph);
  void AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel);
  void AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel);
  void AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                  std::vector<size_t> size_list);
  void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel);
  bool UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
  bool AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
  void UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock);
  void UpdateHostSwapOutQueue(bool mock);
  void ClearSwapInfo(bool mock);
  std::unordered_map<uint32_t, MemReuseUtilPtr> mem_reuse_util_map_;
  std::unordered_map<uint32_t, MemSwapManagerPtr> mem_swap_map_;
  std::unordered_map<uint32_t, bool> is_first_step_map_;
  std::unordered_map<uint32_t, std::set<AnfNodePtr>> graph_output_map_;
  MemReuseUtilPtr mem_reuse_util_{nullptr};
  MemSwapManagerPtr mem_swap_manager_{nullptr};
};
MS_REG_KERNEL_RUNTIME(kGPUDevice, GPUKernelRuntime);
}  // namespace gpu
}  // namespace device
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
mindspore/ccsrc/runtime/device/kernel_runtime.cc (view file @ c0070d3d)
...
@@ -40,7 +40,7 @@ KernelRuntime::~KernelRuntime() {
 #endif
 }
-bool KernelRuntime::Load(session::KernelGraph *graph) { return true; }
+bool KernelRuntime::Load(session::KernelGraph *graph, bool is_task_sink) { return true; }
 bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   if (graph != nullptr) {
...
mindspore/ccsrc/runtime/device/kernel_runtime.h (view file @ c0070d3d)
...
@@ -59,8 +59,8 @@ class KernelRuntime {
   bool DumpDataEnabled();
   bool DumpDataEnabledIteration();
   virtual bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr);
-  virtual bool Load(session::KernelGraph *graph);
-  virtual bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) = 0;
+  virtual bool Load(session::KernelGraph *graph, bool is_task_sink);
+  virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;
   bool LaunchKernel(const session::KernelGraph *graph);
   bool LaunchTaskBasedOnSingleKernel(kernel::KernelModPtr kernel_mod_ptr, const AddressPtrList &kernel_inputs,
                                      const AddressPtrList &kernel_outputs,
...
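To make the updated base-class contract concrete, here is a schematic fragment of a backend conforming to it. The class name DemoKernelRuntime and the RunTask helper are hypothetical, and the other overrides a real runtime must provide are elided:

class DemoKernelRuntime : public mindspore::device::KernelRuntime {
 public:
  // Load is now told whether the graph will run in task-sink mode.
  bool Load(mindspore::session::KernelGraph *graph, bool is_task_sink) override { return true; }
  // Run carries the same flag, letting the session call every backend uniformly.
  bool Run(mindspore::session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override {
    return is_task_sink ? RunTask(graph) : LaunchKernel(graph);
  }

 private:
  bool RunTask(mindspore::session::KernelGraph *graph) { return true; }  // hypothetical task-sink path
};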