Use the unified Execute function to run Graph or Single Op Graph.

c0070d3d · Zhang Qinghua · 77dd91a6 · c0070d3d · c0070d3d · c0070d3d
12 changed file
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@@ -318,7 +318,7 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::
 #endif
  {
    // run task on device
-    Execute(kernel_graph);
+    Execute(kernel_graph, true);
  }
  // summary
  Summary(kernel_graph.get());
@@ -348,17 +348,6 @@ void AscendSession::RunOpHardwareOptimize(const std::shared_ptr<session::KernelG
  MS_LOG(INFO) << "Finish";
 }

-void AscendSession::RunOpExecTask(const std::shared_ptr<KernelGraph> &kernel_graph) const {
-  MS_LOG(INFO) << "Start!";
-  auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
-  MS_EXCEPTION_IF_NULL(runtime_instance);
-  bool ret_ok = runtime_instance->LaunchKernel(kernel_graph.get());
-  if (!ret_ok) {
-    MS_LOG(EXCEPTION) << "Run task error!";
-  }
-  MS_LOG(INFO) << "Finish!";
-}
-
 bool AscendSession::GraphCacheExist(const GraphInfo &graph_info) const {
  return run_op_graphs_.find(graph_info) != run_op_graphs_.end();
 }
@@ -398,7 +387,7 @@ void AscendSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph_i
  // load input data to device
  LoadInputData(graph, input_tensors);
  // run op
-  RunOpExecTask(graph);
+  Execute(graph, false);
  // get output
  if (op_run_info.value != nullptr) {
    std::vector<tensor::TensorPtr> pre_output_tensors;
@@ -552,21 +541,30 @@ void AscendSession::RunOpMemoryClear(const KernelGraph *kernel_graph) const {

 void AscendSession::Load(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  MS_LOG(INFO) << "Start!";
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  bool is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
  (void)device::KernelAdjust::GetInstance().StepLoadCtrlInputs(kernel_graph);
  auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
  MS_EXCEPTION_IF_NULL(runtime_instance);
-  bool ret_ok = runtime_instance->Load(kernel_graph.get());
+  bool ret_ok = runtime_instance->Load(kernel_graph.get(), is_task_sink);
  if (!ret_ok) {
    MS_LOG(EXCEPTION) << "Load task error!";
  }
  MS_LOG(INFO) << "Finish!";
 }

-void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
+void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const {
  MS_LOG(INFO) << "Start!";
+  bool is_task_sink = false;
+  if (is_task) {
+    auto context_ptr = MsContext::GetInstance();
+    MS_EXCEPTION_IF_NULL(context_ptr);
+    is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
+  }
  auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
  MS_EXCEPTION_IF_NULL(runtime_instance);
-  bool ret_ok = runtime_instance->Run(kernel_graph.get());
+  bool ret_ok = runtime_instance->Run(kernel_graph.get(), is_task_sink);
  if (!ret_ok) {
    MS_LOG(EXCEPTION) << "run task error!";
  }

--- a/mindspore/ccsrc/backend/session/ascend_session.h
+++ b/mindspore/ccsrc/backend/session/ascend_session.h
@@ -13,8 +13,10 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+
 #ifndef MINDSPORE_CCSRC_BACKEND_SESSION_ASCEND_SESSION_H
 #define MINDSPORE_CCSRC_BACKEND_SESSION_ASCEND_SESSION_H
+
 #include <unordered_map>
 #include <string>
 #include <memory>
@@ -82,13 +84,12 @@ class AscendSession : public SessionBasic {
                        KernelGraph *kernel_graph) const;
  void RunOpMemoryClear(const KernelGraph *kernel_graph) const;
  void Load(const std::shared_ptr<KernelGraph> &kernel_graph) const;
-  void Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+  void Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const;
  void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;
  void DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs);
  void LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
  // below functions are used for run op
  void RunOpHardwareOptimize(const std::shared_ptr<session::KernelGraph> &kernel_graph) const;
-  void RunOpExecTask(const std::shared_ptr<KernelGraph> &kernel_graph) const;

  static void BackendOptimization(const std::vector<KernelGraphPtr> &all_graphs);
  static void LinkChildGraphs(NotNull<KernelGraphPtr> graph);

--- a/mindspore/ccsrc/backend/session/cpu_session.cc
+++ b/mindspore/ccsrc/backend/session/cpu_session.cc
@@ -118,7 +118,7 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
    debugger_->PreExecute(kernel_graph);
  }
 #endif
-  bool ret = runtime_.Run(kernel_graph.get());
+  bool ret = runtime_.Run(kernel_graph.get(), false);
  if (!ret) {
    MS_LOG(EXCEPTION) << "Run graph failed";
  }

--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -191,9 +191,9 @@ void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const
  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  MS_EXCEPTION_IF_NULL(runtime_instance);
 #ifdef ENABLE_DEBUGGER
-  if (!runtime_instance->Run(kernel_graph.get(), debugger_.get())) {
+  if (!runtime_instance->Run(kernel_graph.get(), false, debugger_.get())) {
 #else
-  if (!runtime_instance->Run(kernel_graph.get())) {
+  if (!runtime_instance->Run(kernel_graph.get(), false)) {
 #endif
    MS_LOG(EXCEPTION) << "GPU execute graph failed!";
  }

--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
@@ -454,10 +454,7 @@ DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size
  return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id);
 }

-bool AscendKernelRuntime::Load(session::KernelGraph *graph) {
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
-  bool is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
+bool AscendKernelRuntime::Load(session::KernelGraph *graph, bool is_task_sink) {
  if (!is_task_sink) {
    return true;
  }
@@ -609,17 +606,14 @@ void AscendKernelRuntime::DebugTaskIdName(GraphId graph_id) {
  }
 }

-bool AscendKernelRuntime::Run(session::KernelGraph *graph, Debugger *debugger) {
+bool AscendKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger) {
  bool ret = false;
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
 #if defined(_WIN32) || defined(_WIN64)
  auto start_time = std::chrono::steady_clock::now();
 #else
  struct timeval start_time, end_time;
  (void)gettimeofday(&start_time, nullptr);
 #endif
-  bool is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
  if (is_task_sink) {
    ret = RunTask(graph);
  } else {

--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
@@ -44,8 +44,8 @@ class AscendKernelRuntime : public KernelRuntime {
  bool GenTask(const session::KernelGraph *graph);
  bool LoadTask(const session::KernelGraph *graph);
  bool RunTask(const session::KernelGraph *graph);
-  bool Load(session::KernelGraph *graph) override;
-  bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
+  bool Load(session::KernelGraph *graph, bool is_task_sink) override;
+  bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override;
  void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                 const std::unordered_set<ValueNodePtr> &value_nodes,
                                 const std::vector<CNodePtr> &execution_order) override;

--- a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc
@@ -287,7 +287,7 @@ void CPUKernelRuntime::DecreaseSummaryRefCount(const session::NamedSummaryOutput
  resource_manager_.DecreaseSummaryRefCount(summary_outputs);
 }

-bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, Debugger *debugger) {
+bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink, Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(kernel_graph);
  resource_manager_.IncreaseAddressRefCount(kernel_graph);


--- a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h
@@ -36,7 +36,7 @@ class CPUKernelRuntime : public KernelRuntime {
  ~CPUKernelRuntime() override = default;

  bool Init() override { return true; }
-  bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
+  bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override;
  void AssignKernelAddress(session::KernelGraph *kernel_graph);
  void BindInputOutput(session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
                       VectorRef *outputs);

--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
-/**
- * Copyright 2019 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
-#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
-
-#include <string>
-#include <memory>
-#include <vector>
-#include <set>
-#include <utility>
-#include <unordered_map>
-#include <unordered_set>
-#include "runtime/device/kernel_runtime.h"
-#include "runtime/device/kernel_runtime_manager.h"
-#include "backend/optimizer/mem_reuse/mem_swap_manager.h"
-
-namespace mindspore {
-namespace device {
-namespace gpu {
-using mindspore::device::memswap::MemSwapManagerPtr;
-class GPUKernelRuntime : public KernelRuntime {
- public:
-  GPUKernelRuntime() = default;
-  ~GPUKernelRuntime() override = default;
-  bool Init() override;
-  void ReleaseDeviceRes() override;
-  void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
-                                 const std::unordered_set<ValueNodePtr> &value_nodes,
-                                 const std::vector<CNodePtr> &execution_order) override;
-  void AssignMemory(session::KernelGraph *graph) override;
-  bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
-#ifdef ENABLE_DUMP_E2E
-  bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
-#endif
-
- protected:
-  DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
-                                       TypeId type_id) override;
-  bool SyncStream() override;
-
- private:
-  GPUKernelRuntime(const GPUKernelRuntime &);
-  GPUKernelRuntime &operator=(const GPUKernelRuntime &);
-  bool InitDevice();
-  bool device_init_{false};
-
-  // The related functions and members for using dynamic memory pool.
-  void InitKernelRefCount(const session::KernelGraph *graph);
-  void InitKernelOutputAddress(const session::KernelGraph *graph);
-  void InitKernelWorkspaceAddress(const session::KernelGraph *graph);
-  void InitMemorySwapInfo(const session::KernelGraph *graph);
-  void SaveGraphOutputNode(const session::KernelGraph *graph);
-  bool IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const;
-  void ClearKernelOutputAddress(const session::KernelGraph *graph);
-  void ClearKernelWorkspaceAddress(const session::KernelGraph *graph);
-  void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph);
-  bool RunOneStep(const session::KernelGraph *graph, Debugger *debugger = nullptr);
-  bool SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
-  bool RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
-  bool LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger = nullptr, bool mock = false,
-                           bool profiling = false);
-  void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
-                                     const AddressPtrList &workspace, const AddressPtrList &outputs);
-  bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock);
-  bool AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
-                             AddressPtrList *kernel_inputs, AddressPtrList *kernel_workspaces,
-                             AddressPtrList *kernel_outputs, bool mock);
-  bool AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, bool mock);
-  bool AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
-                                   AddressPtrList *kernel_outputs, bool mock);
-  bool AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
-                                      const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_workspaces,
-                                      bool mock);
-  void AllocCommunicationOpDynamicRes(const session::KernelGraph *graph);
-  void AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel);
-  void AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel);
-  void AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
-                                  const DeviceAddressPtrList addr_list, size_t total_size,
-                                  std::vector<size_t> size_list);
-  void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel);
-  bool UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
-  bool AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
-  void UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock);
-  void UpdateHostSwapOutQueue(bool mock);
-  void ClearSwapInfo(bool mock);
-  std::unordered_map<uint32_t, MemReuseUtilPtr> mem_reuse_util_map_;
-  std::unordered_map<uint32_t, MemSwapManagerPtr> mem_swap_map_;
-  std::unordered_map<uint32_t, bool> is_first_step_map_;
-  std::unordered_map<uint32_t, std::set<AnfNodePtr>> graph_output_map_;
-
-  MemReuseUtilPtr mem_reuse_util_{nullptr};
-  MemSwapManagerPtr mem_swap_manager_{nullptr};
-};
-MS_REG_KERNEL_RUNTIME(kGPUDevice, GPUKernelRuntime);
-}  // namespace gpu
-}  // namespace device
-}  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
+#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
+
+#include <string>
+#include <memory>
+#include <vector>
+#include <set>
+#include <utility>
+#include <unordered_map>
+#include <unordered_set>
+#include "runtime/device/kernel_runtime.h"
+#include "runtime/device/kernel_runtime_manager.h"
+#include "backend/optimizer/mem_reuse/mem_swap_manager.h"
+
+namespace mindspore {
+namespace device {
+namespace gpu {
+using mindspore::device::memswap::MemSwapManagerPtr;
+class GPUKernelRuntime : public KernelRuntime {
+ public:
+  GPUKernelRuntime() = default;
+  ~GPUKernelRuntime() override = default;
+  bool Init() override;
+  void ReleaseDeviceRes() override;
+  void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
+                                 const std::unordered_set<ValueNodePtr> &value_nodes,
+                                 const std::vector<CNodePtr> &execution_order) override;
+  void AssignMemory(session::KernelGraph *graph) override;
+  bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override;
+#ifdef ENABLE_DUMP_E2E
+  bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
+#endif
+
+ protected:
+  DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
+                                       TypeId type_id) override;
+  bool SyncStream() override;
+
+ private:
+  GPUKernelRuntime(const GPUKernelRuntime &);
+  GPUKernelRuntime &operator=(const GPUKernelRuntime &);
+  bool InitDevice();
+  bool device_init_{false};
+
+  // The related functions and members for using dynamic memory pool.
+  void InitKernelRefCount(const session::KernelGraph *graph);
+  void InitKernelOutputAddress(const session::KernelGraph *graph);
+  void InitKernelWorkspaceAddress(const session::KernelGraph *graph);
+  void InitMemorySwapInfo(const session::KernelGraph *graph);
+  void SaveGraphOutputNode(const session::KernelGraph *graph);
+  bool IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const;
+  void ClearKernelOutputAddress(const session::KernelGraph *graph);
+  void ClearKernelWorkspaceAddress(const session::KernelGraph *graph);
+  void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph);
+  bool RunOneStep(const session::KernelGraph *graph, Debugger *debugger = nullptr);
+  bool SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
+  bool RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
+  bool LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger = nullptr, bool mock = false,
+                           bool profiling = false);
+  void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
+                                     const AddressPtrList &workspace, const AddressPtrList &outputs);
+  bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock);
+  bool AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
+                             AddressPtrList *kernel_inputs, AddressPtrList *kernel_workspaces,
+                             AddressPtrList *kernel_outputs, bool mock);
+  bool AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, bool mock);
+  bool AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
+                                   AddressPtrList *kernel_outputs, bool mock);
+  bool AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
+                                      const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_workspaces,
+                                      bool mock);
+  void AllocCommunicationOpDynamicRes(const session::KernelGraph *graph);
+  void AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel);
+  void AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel);
+  void AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
+                                  const DeviceAddressPtrList addr_list, size_t total_size,
+                                  std::vector<size_t> size_list);
+  void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel);
+  bool UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
+  bool AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
+  void UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock);
+  void UpdateHostSwapOutQueue(bool mock);
+  void ClearSwapInfo(bool mock);
+  std::unordered_map<uint32_t, MemReuseUtilPtr> mem_reuse_util_map_;
+  std::unordered_map<uint32_t, MemSwapManagerPtr> mem_swap_map_;
+  std::unordered_map<uint32_t, bool> is_first_step_map_;
+  std::unordered_map<uint32_t, std::set<AnfNodePtr>> graph_output_map_;
+
+  MemReuseUtilPtr mem_reuse_util_{nullptr};
+  MemSwapManagerPtr mem_swap_manager_{nullptr};
+};
+MS_REG_KERNEL_RUNTIME(kGPUDevice, GPUKernelRuntime);
+}  // namespace gpu
+}  // namespace device
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
@@ -40,7 +40,7 @@ KernelRuntime::~KernelRuntime() {
 #endif
 }

-bool KernelRuntime::Load(session::KernelGraph *graph) { return true; }
+bool KernelRuntime::Load(session::KernelGraph *graph, bool is_task_sink) { return true; }

 bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
  if (graph != nullptr) {

--- a/mindspore/ccsrc/runtime/device/kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h
@@ -59,8 +59,8 @@ class KernelRuntime {
  bool DumpDataEnabled();
  bool DumpDataEnabledIteration();
  virtual bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr);
-  virtual bool Load(session::KernelGraph *graph);
-  virtual bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) = 0;
+  virtual bool Load(session::KernelGraph *graph, bool is_task_sink);
+  virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;
  bool LaunchKernel(const session::KernelGraph *graph);
  bool LaunchTaskBasedOnSingleKernel(kernel::KernelModPtr kernel_mod_ptr, const AddressPtrList &kernel_inputs,
                                     const AddressPtrList &kernel_outputs,