From 3ec4034215f20fd091ffa109b11b08ce59f4ce8f Mon Sep 17 00:00:00 2001
From: liuqi <liuqi10@xiaomi.com>
Date: Fri, 7 Dec 2018 18:05:42 +0800
Subject: [PATCH] Bug: Replace OpenCLRuntime with GPURuntime in GPUDevice.

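1. Add GPURuntime for GPUDevice; Device::opencl_runtime() becomes Device::gpu_runtime()
2. Access OpenCLRuntime through GPURuntime::opencl_runtime()
3. Move ScratchImageManager and the memory type setting (UseImageMemory/set_mem_type) from OpenCLRuntime to GPURuntime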
---
 mace/core/device.cc                           |  4 +-
 mace/core/device.h                            |  6 +--
 mace/core/runtime/opencl/gpu_device.cc        |  7 +--
 mace/core/runtime/opencl/gpu_device.h         |  4 +-
 mace/core/runtime/opencl/gpu_runtime.cc       | 45 +++++++++++++++++++
 mace/core/runtime/opencl/gpu_runtime.h        | 45 +++++++++++++++++++
 mace/core/runtime/opencl/opencl_runtime.cc    | 16 +------
 mace/core/runtime/opencl/opencl_runtime.h     |  6 ---
 mace/core/workspace.cc                        |  2 +-
 mace/libmace/mace.cc                          |  8 ++--
 mace/ops/activation.cc                        |  2 +-
 mace/ops/addn.cc                              |  2 +-
 mace/ops/batch_norm.cc                        |  2 +-
 mace/ops/batch_to_space.cc                    |  2 +-
 mace/ops/bias_add.cc                          |  2 +-
 mace/ops/channel_shuffle.cc                   |  2 +-
 mace/ops/concat.cc                            |  2 +-
 mace/ops/conv_2d.cc                           |  4 +-
 mace/ops/crop.cc                              |  2 +-
 mace/ops/deconv_2d.cc                         |  2 +-
 mace/ops/depth_to_space.cc                    |  2 +-
 mace/ops/depthwise_conv2d.cc                  |  2 +-
 mace/ops/depthwise_deconv2d.cc                |  2 +-
 mace/ops/eltwise.cc                           |  2 +-
 mace/ops/fully_connected.cc                   |  2 +-
 mace/ops/lstm_cell.cc                         |  2 +-
 mace/ops/opencl/buffer/buffer_transform.cc    |  6 +--
 .../opencl/buffer/buffer_type_transform.cc    |  2 +-
 mace/ops/opencl/buffer/conv_2d_1x1.cc         |  2 +-
 mace/ops/opencl/buffer/conv_2d_general.cc     |  2 +-
 mace/ops/opencl/buffer/depthwise_conv2d.cc    |  2 +-
 mace/ops/opencl/buffer/pooling.h              |  2 +-
 mace/ops/opencl/buffer/softmax.h              |  2 +-
 mace/ops/opencl/buffer/utils.cc               |  2 +-
 mace/ops/opencl/image/activation.h            |  2 +-
 mace/ops/opencl/image/addn.h                  |  2 +-
 mace/ops/opencl/image/batch_norm.h            |  2 +-
 mace/ops/opencl/image/batch_to_space.h        |  2 +-
 mace/ops/opencl/image/bias_add.h              |  2 +-
 mace/ops/opencl/image/buffer_to_image.h       |  2 +-
 mace/ops/opencl/image/channel_shuffle.h       |  2 +-
 mace/ops/opencl/image/concat.cc               |  4 +-
 mace/ops/opencl/image/conv_2d_1x1.cc          |  2 +-
 mace/ops/opencl/image/conv_2d_3x3.cc          |  2 +-
 mace/ops/opencl/image/conv_2d_general.cc      |  2 +-
 mace/ops/opencl/image/crop.h                  |  2 +-
 mace/ops/opencl/image/deconv_2d.h             |  2 +-
 mace/ops/opencl/image/depth_to_space.h        |  2 +-
 mace/ops/opencl/image/depthwise_conv2d.cc     |  2 +-
 mace/ops/opencl/image/depthwise_deconv2d.h    |  2 +-
 mace/ops/opencl/image/eltwise.h               |  2 +-
 mace/ops/opencl/image/fully_connected.h       |  2 +-
 mace/ops/opencl/image/image_to_buffer.h       |  2 +-
 mace/ops/opencl/image/lstm_cell.h             |  2 +-
 mace/ops/opencl/image/matmul.h                |  2 +-
 mace/ops/opencl/image/pad.h                   |  2 +-
 mace/ops/opencl/image/pooling.h               |  2 +-
 mace/ops/opencl/image/reduce_mean.h           |  2 +-
 mace/ops/opencl/image/resize_bicubic.h        |  2 +-
 mace/ops/opencl/image/resize_bilinear.h       |  2 +-
 mace/ops/opencl/image/softmax.h               |  2 +-
 mace/ops/opencl/image/space_to_batch.h        |  2 +-
 mace/ops/opencl/image/space_to_depth.h        |  2 +-
 mace/ops/opencl/image/split.h                 |  2 +-
 mace/ops/opencl/image/sqrdiff_mean.h          |  2 +-
 mace/ops/opencl/image/winograd_conv2d.cc      |  9 ++--
 mace/ops/opencl/out_of_range_check_test.cc    |  2 +-
 mace/ops/ops_test_util.cc                     |  6 +--
 mace/ops/pad.cc                               |  2 +-
 mace/ops/pooling.cc                           |  2 +-
 mace/ops/reduce_mean.cc                       |  2 +-
 mace/ops/resize_bicubic.cc                    |  2 +-
 mace/ops/resize_bilinear.cc                   |  2 +-
 mace/ops/softmax.cc                           |  2 +-
 mace/ops/space_to_batch.cc                    |  2 +-
 mace/ops/space_to_depth.cc                    |  2 +-
 mace/ops/split.cc                             |  2 +-
 mace/ops/sqrdiff_mean.cc                      |  2 +-
 78 files changed, 186 insertions(+), 112 deletions(-)
 create mode 100644 mace/core/runtime/opencl/gpu_runtime.cc
 create mode 100644 mace/core/runtime/opencl/gpu_runtime.h

diff --git a/mace/core/device.cc b/mace/core/device.cc
index 35e8c7af..4eb547c2 100644
--- a/mace/core/device.cc
+++ b/mace/core/device.cc
@@ -33,8 +33,8 @@ CPURuntime *CPUDevice::cpu_runtime() {
 }
 
 #ifdef MACE_ENABLE_OPENCL
-OpenCLRuntime *CPUDevice::opencl_runtime() {
-  LOG(FATAL) << "CPU device should not call OpenCL Runtime";
+GPURuntime *CPUDevice::gpu_runtime() {
+  LOG(FATAL) << "CPU device should not call GPU Runtime";
   return nullptr;
 }
 #endif
diff --git a/mace/core/device.h b/mace/core/device.h
index b7fe5f32..627d46be 100644
--- a/mace/core/device.h
+++ b/mace/core/device.h
@@ -21,7 +21,7 @@
 #include "mace/core/allocator.h"
 
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/core/runtime/opencl/gpu_runtime.h"
 #endif
 
 namespace mace {
@@ -33,7 +33,7 @@ class Device {
   virtual ~Device() {}
 
 #ifdef MACE_ENABLE_OPENCL
-  virtual OpenCLRuntime *opencl_runtime() = 0;
+  virtual GPURuntime *gpu_runtime() = 0;
 #endif  // MACE_ENABLE_OPENCL
   virtual CPURuntime *cpu_runtime() = 0;
 
@@ -50,7 +50,7 @@ class CPUDevice : public Device {
   virtual ~CPUDevice();
 
 #ifdef MACE_ENABLE_OPENCL
-  OpenCLRuntime *opencl_runtime() override;
+  GPURuntime *gpu_runtime() override;
 #endif
   CPURuntime *cpu_runtime() override;
 
diff --git a/mace/core/runtime/opencl/gpu_device.cc b/mace/core/runtime/opencl/gpu_device.cc
index 09bb9181..caea5767 100644
--- a/mace/core/runtime/opencl/gpu_device.cc
+++ b/mace/core/runtime/opencl/gpu_device.cc
@@ -30,12 +30,13 @@ GPUDevice::GPUDevice(std::shared_ptr<Tuner<uint32_t>> tuner,
     runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf,
                                opencl_binary_storage, tuner)),
     allocator_(new OpenCLAllocator(runtime_.get())),
-    scratch_buffer_(new ScratchBuffer(allocator_.get())) {}
+    scratch_buffer_(new ScratchBuffer(allocator_.get())),
+    gpu_runtime_(new GPURuntime(runtime_.get())) {}
 
 GPUDevice::~GPUDevice() = default;
 
-OpenCLRuntime* GPUDevice::opencl_runtime() {
-  return runtime_.get();
+GPURuntime* GPUDevice::gpu_runtime() {
+  return gpu_runtime_.get();
 }
 
 Allocator *GPUDevice::allocator() {
diff --git a/mace/core/runtime/opencl/gpu_device.h b/mace/core/runtime/opencl/gpu_device.h
index 1d36461b..d3c7d98e 100644
--- a/mace/core/runtime/opencl/gpu_device.h
+++ b/mace/core/runtime/opencl/gpu_device.h
@@ -19,6 +19,7 @@
 
 #include "mace/core/device_context.h"
 #include "mace/core/device.h"
+#include "mace/core/runtime/opencl/gpu_runtime.h"
 #include "mace/core/runtime/opencl/opencl_allocator.h"
 
 namespace mace {
@@ -34,7 +35,7 @@ class GPUDevice : public CPUDevice {
             CPUAffinityPolicy cpu_affinity_policy = AFFINITY_NONE,
             bool use_gemmlowp = false);
   ~GPUDevice();
-  OpenCLRuntime *opencl_runtime() override;
+  GPURuntime *gpu_runtime() override;
   Allocator *allocator() override;
   DeviceType device_type() const override;
   ScratchBuffer *scratch_buffer() override;
@@ -42,6 +43,7 @@ class GPUDevice : public CPUDevice {
   std::unique_ptr<OpenCLRuntime> runtime_;
   std::unique_ptr<OpenCLAllocator> allocator_;
   std::unique_ptr<ScratchBuffer> scratch_buffer_;
+  std::unique_ptr<GPURuntime> gpu_runtime_;
 };
 
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/gpu_runtime.cc b/mace/core/runtime/opencl/gpu_runtime.cc
new file mode 100644
index 00000000..8574ad48
--- /dev/null
+++ b/mace/core/runtime/opencl/gpu_runtime.cc
@@ -0,0 +1,45 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/runtime/opencl/gpu_runtime.h"
+
+#include "mace/core/runtime/opencl/scratch_image.h"
+
+namespace mace {
+
+GPURuntime::GPURuntime(mace::OpenCLRuntime *runtime)
+    : runtime_(runtime),
+      scratch_image_manager_(new ScratchImageManager),
+      mem_type_(MemoryType::GPU_IMAGE) {}
+
+GPURuntime::~GPURuntime() = default;
+
+OpenCLRuntime* GPURuntime::opencl_runtime() {
+  return runtime_;
+}
+
+ScratchImageManager* GPURuntime::scratch_image_manager() const {
+  return scratch_image_manager_.get();
+}
+
+bool GPURuntime::UseImageMemory() {
+  return this->mem_type_ == MemoryType::GPU_IMAGE;
+}
+
+void GPURuntime::set_mem_type(MemoryType type) {
+  this->mem_type_ = type;
+}
+
+
+}  // namespace mace
diff --git a/mace/core/runtime/opencl/gpu_runtime.h b/mace/core/runtime/opencl/gpu_runtime.h
new file mode 100644
index 00000000..fee776ed
--- /dev/null
+++ b/mace/core/runtime/opencl/gpu_runtime.h
@@ -0,0 +1,45 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_RUNTIME_OPENCL_GPU_RUNTIME_H_
+#define MACE_CORE_RUNTIME_OPENCL_GPU_RUNTIME_H_
+
+#include <memory>
+
+#include "mace/proto/mace.pb.h"
+
+namespace mace {
+
+class OpenCLRuntime;
+class ScratchImageManager;
+
+class GPURuntime {
+ public:
+  explicit GPURuntime(OpenCLRuntime *runtime);
+  ~GPURuntime();
+  OpenCLRuntime *opencl_runtime();
+  ScratchImageManager *scratch_image_manager() const;
+
+  // TODO(liuqi): remove this function in the future, make decision at runtime.
+  bool UseImageMemory();
+  void set_mem_type(MemoryType type);
+
+ private:
+  OpenCLRuntime *runtime_;
+  std::unique_ptr<ScratchImageManager> scratch_image_manager_;
+  MemoryType mem_type_;
+};
+
+}  // namespace mace
+#endif  // MACE_CORE_RUNTIME_OPENCL_GPU_RUNTIME_H_
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index b552c65a..904e74f6 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -284,9 +284,7 @@ OpenCLRuntime::OpenCLRuntime(
     is_opencl_avaliable_(false),
     is_profiling_enabled_(false),
     opencl_version_(CL_VER_UNKNOWN),
-    gpu_type_(UNKNOWN),
-    mem_type_(MemoryType::GPU_IMAGE),
-    scratch_image_manager_(new ScratchImageManager) {
+    gpu_type_(UNKNOWN) {
   std::vector<cl::Platform> all_platforms;
   cl::Platform::get(&all_platforms);
   if (all_platforms.size() == 0) {
@@ -471,14 +469,6 @@ uint32_t OpenCLRuntime::device_compute_units() const {
   return device_compute_units_;
 }
 
-bool OpenCLRuntime::UseImageMemory() {
-  return this->mem_type_ == MemoryType::GPU_IMAGE;
-}
-
-void OpenCLRuntime::set_mem_type(MemoryType type) {
-  this->mem_type_ = type;
-}
-
 bool OpenCLRuntime::BuildProgramFromCache(
     const std::string &built_program_key,
     const std::string &build_options_str,
@@ -792,8 +782,4 @@ bool OpenCLRuntime::is_profiling_enabled() const {
   return is_profiling_enabled_;
 }
 
-ScratchImageManager* OpenCLRuntime::scratch_image_manager() const {
-  return scratch_image_manager_.get();
-}
-
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index 546b8008..1e189b8e 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -83,11 +83,7 @@ class OpenCLRuntime {
   uint64_t device_global_mem_cache_size() const;
   uint32_t device_compute_units() const;
   Tuner<uint32_t> *tuner();
-  ScratchImageManager *scratch_image_manager() const;
   bool is_opencl_avaliable();
-  // TODO(liuqi): remove this function in the future, make decision at runtime.
-  bool UseImageMemory();
-  void set_mem_type(MemoryType type);
 
   void GetCallStats(const cl::Event &event, CallStats *stats);
   uint64_t GetDeviceMaxWorkGroupSize();
@@ -135,8 +131,6 @@ class OpenCLRuntime {
   bool is_profiling_enabled_;
   OpenCLVersion opencl_version_;
   GPUType gpu_type_;
-  MemoryType mem_type_;
-  std::unique_ptr<ScratchImageManager> scratch_image_manager_;
   // All OpenCL object must be a pointer and manually deleted before unloading
   // OpenCL library.
   std::shared_ptr<cl::Context> context_;
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index bbef2c5d..5123e670 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -109,7 +109,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
             (!is_quantize_model && HasQuantizedTensor(net_def))));
 #ifdef MACE_ENABLE_OPENCL
     diffused_buffer_ = diffused_buffer_ || (device_type == DeviceType::GPU &&
-        device->opencl_runtime()->GetDeviceMaxMemAllocSize() <=
+        device->gpu_runtime()->opencl_runtime()->GetDeviceMaxMemAllocSize() <=
             static_cast<uint64_t>(model_data_size));
 #endif
     if (diffused_buffer_) {
diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc
index 9244d62a..bcaff34d 100644
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -69,8 +69,8 @@ void UnloadModelData(const unsigned char *model_data,
 #ifdef MACE_ENABLE_OPENCL
 MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
   // Check OpenCL avaliable
-  auto runtime = device->opencl_runtime();
-  if (!runtime->is_opencl_avaliable()) {
+  auto runtime = device->gpu_runtime();
+  if (!runtime->opencl_runtime()->is_opencl_avaliable()) {
     LOG(WARNING) << "The device does not support OpenCL";
     return MaceStatus::MACE_OUT_OF_RESOURCES;
   }
@@ -678,8 +678,8 @@ MaceStatus MaceEngine::Impl::Run(
 
 #ifdef MACE_ENABLE_OPENCL
   if (device_type_ == GPU) {
-    device_->opencl_runtime()->command_queue().finish();
-    device_->opencl_runtime()->SaveBuiltCLProgram();
+    device_->gpu_runtime()->opencl_runtime()->command_queue().finish();
+    device_->gpu_runtime()->opencl_runtime()->SaveBuiltCLProgram();
   }
 #endif
   for (auto &output : *outputs) {
diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc
index b904b5c2..fe8862bb 100644
--- a/mace/ops/activation.cc
+++ b/mace/ops/activation.cc
@@ -81,7 +81,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
     auto relux_max_limit = static_cast<T>(
         Operation::GetOptionalArg<float>("max_limit", 0.0f));
     MemoryType mem_type;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(
           new opencl::image::ActivationKernel<T>(type, relux_max_limit));
diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc
index 4040de1f..0fe0c7b4 100644
--- a/mace/ops/addn.cc
+++ b/mace/ops/addn.cc
@@ -106,7 +106,7 @@ class AddNOp<DeviceType::GPU, T> : public Operation {
  public:
   explicit AddNOp(OpConstructContext *context)
       : Operation(context) {
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::AddNKernel<T>);
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc
index 1758f79b..3ca5592a 100644
--- a/mace/ops/batch_norm.cc
+++ b/mace/ops/batch_norm.cc
@@ -149,7 +149,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
         Operation::GetOptionalArg<std::string>("activation", "NOOP"));
     float relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
     MemoryType mem_type;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::BatchNormKernel<T>(
           epsilon, activation, relux_max_limit));
diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc
index 5cc6a1e0..3aa5acec 100644
--- a/mace/ops/batch_to_space.cc
+++ b/mace/ops/batch_to_space.cc
@@ -265,7 +265,7 @@ class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
  public:
   explicit BatchToSpaceNDOp(OpConstructContext *context)
       : BatchToSpaceOpBase(context) {
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::BatchToSpaceKernel<T>);
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc
index 59579fa5..9190cf95 100644
--- a/mace/ops/bias_add.cc
+++ b/mace/ops/bias_add.cc
@@ -101,7 +101,7 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
         data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
             "data_format", NHWC))) {
     MemoryType mem_type;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::BiasAddKernel<T>);
     } else {
diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc
index 04c6a88d..d4404c61 100644
--- a/mace/ops/channel_shuffle.cc
+++ b/mace/ops/channel_shuffle.cc
@@ -84,7 +84,7 @@ class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
   explicit ChannelShuffleOp(OpConstructContext *context)
       : Operation(context) {
     const int groups = Operation::GetOptionalArg<int>("group", 1);
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::ChannelShuffleKernel<T>(groups));
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc
index eec11e0b..3fa5ef2c 100644
--- a/mace/ops/concat.cc
+++ b/mace/ops/concat.cc
@@ -196,7 +196,7 @@ class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
  public:
   explicit ConcatOp(OpConstructContext *context)
       : ConcatOpBase(context) {
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::ConcatKernel<T>(axis_));
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc
index a5cbec74..0a0d3bb5 100644
--- a/mace/ops/conv_2d.cc
+++ b/mace/ops/conv_2d.cc
@@ -963,7 +963,7 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
         relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
         wino_block_size_(Operation::GetOptionalArg<int>("wino_block_size", 0)) {
     MemoryType mem_type;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::Conv2dKernel<T>);
     } else {
@@ -974,7 +974,7 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
     // Transform filter tensor to target format
     if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
         (kernel_->CheckUseWinograd(
-          context->device()->opencl_runtime(),
+          context->device()->gpu_runtime()->opencl_runtime(),
           context->workspace()->GetTensor(
               operator_def_->input(1))->shape(),
           std::vector<index_t>(operator_def_->output_shape(0).dims().begin(),
diff --git a/mace/ops/crop.cc b/mace/ops/crop.cc
index b056f21c..7b705069 100644
--- a/mace/ops/crop.cc
+++ b/mace/ops/crop.cc
@@ -113,7 +113,7 @@ class CropOp<DeviceType::GPU, T> : public Operation {
   explicit CropOp(OpConstructContext *context)
       : Operation(context) {
     const int axis = Operation::GetOptionalArg<int>("axis", 2);
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::CropKernel<T>(
           axis, Operation::GetRepeatedArgs<int>("offset")));
     } else {
diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc
index 5697c841..575e81ad 100644
--- a/mace/ops/deconv_2d.cc
+++ b/mace/ops/deconv_2d.cc
@@ -360,7 +360,7 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
   explicit Deconv2dOp(OpConstructContext *context)
       : Deconv2dOpBase(context) {
     MemoryType mem_type = MemoryType::GPU_IMAGE;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::Deconv2dKernel<T>);
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc
index e18cc106..ee06075a 100644
--- a/mace/ops/depth_to_space.cc
+++ b/mace/ops/depth_to_space.cc
@@ -96,7 +96,7 @@ class DepthToSpaceOp<DeviceType::GPU, T> : public Operation {
   explicit DepthToSpaceOp(OpConstructContext *context)
       : Operation(context) {
     int block_size = Operation::GetOptionalArg<int>("block_size", 1);
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::DepthToSpaceKernel<T>(block_size));
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc
index 8a85ab46..2f849ef7 100644
--- a/mace/ops/depthwise_conv2d.cc
+++ b/mace/ops/depthwise_conv2d.cc
@@ -492,7 +492,7 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
   explicit DepthwiseConv2dOp(OpConstructContext *context)
       : DepthwiseConv2dOpBase(context) {
     MemoryType mem_type;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::DepthwiseConv2dKernel<T>);
     } else {
diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc
index 3f10a514..a4e7148e 100644
--- a/mace/ops/depthwise_deconv2d.cc
+++ b/mace/ops/depthwise_deconv2d.cc
@@ -410,7 +410,7 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
   explicit DepthwiseDeconv2dOp(OpConstructContext *context)
       : Deconv2dOpBase(context) {
     MemoryType mem_type = MemoryType::GPU_IMAGE;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::DepthwiseDeconv2dKernel<T>);
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc
index 863b69ed..1a2e0908 100644
--- a/mace/ops/eltwise.cc
+++ b/mace/ops/eltwise.cc
@@ -1088,7 +1088,7 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
     int32_t scalar_input_index = Operation::GetOptionalArg<int32_t>(
             "scalar_input_index", 1);
     MemoryType mem_type;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::EltwiseKernel<T>(
           type, coeff, scalar_input, scalar_input_index));
diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc
index ef919d92..31b1fb05 100644
--- a/mace/ops/fully_connected.cc
+++ b/mace/ops/fully_connected.cc
@@ -194,7 +194,7 @@ class FullyConnectedOp<DeviceType::GPU, T> : public FullyConnectedOpBase {
   explicit FullyConnectedOp(OpConstructContext *context)
       : FullyConnectedOpBase(context) {
     MemoryType mem_type;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::FullyConnectedKernel<T>);
     } else {
diff --git a/mace/ops/lstm_cell.cc b/mace/ops/lstm_cell.cc
index dfbfa155..bc34b969 100644
--- a/mace/ops/lstm_cell.cc
+++ b/mace/ops/lstm_cell.cc
@@ -34,7 +34,7 @@ class LSTMCellOp<DeviceType::GPU, T> : public Operation {
         Operation::GetOptionalArg<float>("scalar_input",
                                          0.0));
     MemoryType mem_type = MemoryType::GPU_IMAGE;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::LSTMCellKernel<T>(forget_bias));
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/opencl/buffer/buffer_transform.cc b/mace/ops/opencl/buffer/buffer_transform.cc
index 9ba3f81d..5bfc5389 100644
--- a/mace/ops/opencl/buffer/buffer_transform.cc
+++ b/mace/ops/opencl/buffer/buffer_transform.cc
@@ -47,7 +47,7 @@ MaceStatus TransformConv2DFilter(
   MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
   output->Reshape(input->shape());
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
@@ -116,7 +116,7 @@ MaceStatus TransformDWConv2DFilter(
   MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
   output->Reshape(input->shape());
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
@@ -173,7 +173,7 @@ MaceStatus TransformArgument(
   MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
   output->Reshape(input->shape());
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
diff --git a/mace/ops/opencl/buffer/buffer_type_transform.cc b/mace/ops/opencl/buffer/buffer_type_transform.cc
index ce405e9f..75728379 100644
--- a/mace/ops/opencl/buffer/buffer_type_transform.cc
+++ b/mace/ops/opencl/buffer/buffer_type_transform.cc
@@ -31,7 +31,7 @@ MaceStatus BufferTypeTransform(
     Tensor *output) {
   MACE_RETURN_IF_ERROR(output->ResizeLike(input));
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION
 
   const uint32_t gws =
diff --git a/mace/ops/opencl/buffer/conv_2d_1x1.cc b/mace/ops/opencl/buffer/conv_2d_1x1.cc
index 62e77b17..abe7d93b 100644
--- a/mace/ops/opencl/buffer/conv_2d_1x1.cc
+++ b/mace/ops/opencl/buffer/conv_2d_1x1.cc
@@ -43,7 +43,7 @@ MaceStatus Conv2d1x1(OpContext *context,
   const index_t in_height = padded_input->dim(1);
   const index_t in_width = padded_input->dim(2);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/buffer/conv_2d_general.cc b/mace/ops/opencl/buffer/conv_2d_general.cc
index f9cc804d..e8ac509c 100644
--- a/mace/ops/opencl/buffer/conv_2d_general.cc
+++ b/mace/ops/opencl/buffer/conv_2d_general.cc
@@ -48,7 +48,7 @@ MaceStatus Conv2dGeneral(OpContext *context,
   const index_t filter_height = filter->dim(2);
   const index_t filter_width = filter->dim(3);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/buffer/depthwise_conv2d.cc b/mace/ops/opencl/buffer/depthwise_conv2d.cc
index 0ba4526c..d2c33599 100644
--- a/mace/ops/opencl/buffer/depthwise_conv2d.cc
+++ b/mace/ops/opencl/buffer/depthwise_conv2d.cc
@@ -48,7 +48,7 @@ MaceStatus DepthwiseConv2d(OpContext *context,
   const index_t filter_height = filter->dim(2);
   const index_t filter_width = filter->dim(3);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/buffer/pooling.h b/mace/ops/opencl/buffer/pooling.h
index 4684d687..de7d7610 100644
--- a/mace/ops/opencl/buffer/pooling.h
+++ b/mace/ops/opencl/buffer/pooling.h
@@ -92,7 +92,7 @@ MaceStatus PoolingKernel<T>::Compute(
   bool input_changed = !IsVecEqual(input_shape_, input->shape());
   input_shape_ = input->shape();
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
 
   // pad input
   std::vector<index_t> padded_input_shape = input->shape();
diff --git a/mace/ops/opencl/buffer/softmax.h b/mace/ops/opencl/buffer/softmax.h
index 3147a935..248ee0c8 100644
--- a/mace/ops/opencl/buffer/softmax.h
+++ b/mace/ops/opencl/buffer/softmax.h
@@ -75,7 +75,7 @@ MaceStatus SoftmaxKernel<T>::Compute(
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/buffer/utils.cc b/mace/ops/opencl/buffer/utils.cc
index b4214a0a..141a96b7 100644
--- a/mace/ops/opencl/buffer/utils.cc
+++ b/mace/ops/opencl/buffer/utils.cc
@@ -47,7 +47,7 @@ MaceStatus PadInput(OpContext *context,
       static_cast<uint32_t>(padded_height * batch)
   };
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/image/activation.h b/mace/ops/opencl/image/activation.h
index 93944b5b..80713c36 100644
--- a/mace/ops/opencl/image/activation.h
+++ b/mace/ops/opencl/image/activation.h
@@ -66,7 +66,7 @@ MaceStatus ActivationKernel<T>::Compute(
 
   const index_t channel_blocks = RoundUpDiv4(channels);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/addn.h b/mace/ops/opencl/image/addn.h
index 7692ac06..48f6d8f8 100644
--- a/mace/ops/opencl/image/addn.h
+++ b/mace/ops/opencl/image/addn.h
@@ -57,7 +57,7 @@ MaceStatus AddNKernel<T>::Compute(
   const index_t width = input_tensors[0]->dim(2);
   const index_t channels = input_tensors[0]->dim(3);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   for (size_t i = 1; i < size; ++i) {
diff --git a/mace/ops/opencl/image/batch_norm.h b/mace/ops/opencl/image/batch_norm.h
index 5685c514..68908830 100644
--- a/mace/ops/opencl/image/batch_norm.h
+++ b/mace/ops/opencl/image/batch_norm.h
@@ -85,7 +85,7 @@ MaceStatus BatchNormKernel<T>::Compute(
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/batch_to_space.h b/mace/ops/opencl/image/batch_to_space.h
index 9d918026..35281c70 100644
--- a/mace/ops/opencl/image/batch_to_space.h
+++ b/mace/ops/opencl/image/batch_to_space.h
@@ -68,7 +68,7 @@ MaceStatus BatchToSpaceKernel<T>::Compute(
       chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
       static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/bias_add.h b/mace/ops/opencl/image/bias_add.h
index 25e2392e..a37ee2b1 100644
--- a/mace/ops/opencl/image/bias_add.h
+++ b/mace/ops/opencl/image/bias_add.h
@@ -62,7 +62,7 @@ MaceStatus BiasAddKernel<T>::Compute(
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/buffer_to_image.h b/mace/ops/opencl/image/buffer_to_image.h
index 14a0ae4b..6ff3284e 100644
--- a/mace/ops/opencl/image/buffer_to_image.h
+++ b/mace/ops/opencl/image/buffer_to_image.h
@@ -98,7 +98,7 @@ MaceStatus BufferToImage<T>::Compute(
     }
   }
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/channel_shuffle.h b/mace/ops/opencl/image/channel_shuffle.h
index 53acbf15..f890c0c3 100644
--- a/mace/ops/opencl/image/channel_shuffle.h
+++ b/mace/ops/opencl/image/channel_shuffle.h
@@ -70,7 +70,7 @@ MaceStatus ChannelShuffleKernel<T>::Compute(
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
 
   MACE_OUT_OF_RANGE_DEFINITION;
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/concat.cc b/mace/ops/opencl/image/concat.cc
index aab72c54..5dfe666e 100644
--- a/mace/ops/opencl/image/concat.cc
+++ b/mace/ops/opencl/image/concat.cc
@@ -65,7 +65,7 @@ MaceStatus Concat2(OpContext *context,
       static_cast<uint32_t>(batch * height),
   };
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
@@ -126,7 +126,7 @@ MaceStatus ConcatN(OpContext *context,
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/image/conv_2d_1x1.cc b/mace/ops/opencl/image/conv_2d_1x1.cc
index f88882ee..57be0750 100644
--- a/mace/ops/opencl/image/conv_2d_1x1.cc
+++ b/mace/ops/opencl/image/conv_2d_1x1.cc
@@ -95,7 +95,7 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
   const index_t width_blocks = RoundUpDiv4(width);
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/image/conv_2d_3x3.cc b/mace/ops/opencl/image/conv_2d_3x3.cc
index 3e5aee90..f7905a0c 100644
--- a/mace/ops/opencl/image/conv_2d_3x3.cc
+++ b/mace/ops/opencl/image/conv_2d_3x3.cc
@@ -83,7 +83,7 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
   const index_t width_blocks = RoundUpDiv<index_t, 5>(width);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/image/conv_2d_general.cc b/mace/ops/opencl/image/conv_2d_general.cc
index 120a3daa..28bdea6c 100644
--- a/mace/ops/opencl/image/conv_2d_general.cc
+++ b/mace/ops/opencl/image/conv_2d_general.cc
@@ -91,7 +91,7 @@ extern MaceStatus Conv2d(OpContext *context,
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
   const index_t width_blocks = RoundUpDiv4(width);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/image/crop.h b/mace/ops/opencl/image/crop.h
index c8f98a4c..a83349c4 100644
--- a/mace/ops/opencl/image/crop.h
+++ b/mace/ops/opencl/image/crop.h
@@ -141,7 +141,7 @@ MaceStatus CropKernel<T>::Compute(
       static_cast<uint32_t>(output->dim(0) * output->dim(1))
   };
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/deconv_2d.h b/mace/ops/opencl/image/deconv_2d.h
index f3d6cbe9..a8dd9c26 100644
--- a/mace/ops/opencl/image/deconv_2d.h
+++ b/mace/ops/opencl/image/deconv_2d.h
@@ -92,7 +92,7 @@ MaceStatus Deconv2dKernel<T>::Compute(
   const int align_w = stride_w - 1 - padding_w;
   const int kernel_size = filter->dim(2) * filter->dim(3);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/depth_to_space.h b/mace/ops/opencl/image/depth_to_space.h
index 77c4bd53..1783b813 100644
--- a/mace/ops/opencl/image/depth_to_space.h
+++ b/mace/ops/opencl/image/depth_to_space.h
@@ -87,7 +87,7 @@ MaceStatus DepthToSpaceKernel<T>::Compute(
       static_cast<uint32_t>(output_width),
       static_cast<uint32_t>(output_height * batch)
   };
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/depthwise_conv2d.cc b/mace/ops/opencl/image/depthwise_conv2d.cc
index 02409ebe..57a4415e 100644
--- a/mace/ops/opencl/image/depthwise_conv2d.cc
+++ b/mace/ops/opencl/image/depthwise_conv2d.cc
@@ -93,7 +93,7 @@ MaceStatus DepthwiseConv2d(OpContext *context,
                            static_cast<uint32_t>(width_blocks),
                            static_cast<uint32_t>(height * batch)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/image/depthwise_deconv2d.h b/mace/ops/opencl/image/depthwise_deconv2d.h
index 96fdfa51..d07a1649 100644
--- a/mace/ops/opencl/image/depthwise_deconv2d.h
+++ b/mace/ops/opencl/image/depthwise_deconv2d.h
@@ -98,7 +98,7 @@ MaceStatus DepthwiseDeconv2dKernel<T>::Compute(
   const int align_w = stride_w - 1 - padding_w;
   const int kernel_size = filter->dim(2) * filter->dim(3);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/eltwise.h b/mace/ops/opencl/image/eltwise.h
index 2afb3342..9600d501 100644
--- a/mace/ops/opencl/image/eltwise.h
+++ b/mace/ops/opencl/image/eltwise.h
@@ -117,7 +117,7 @@ MaceStatus EltwiseKernel<T>::Compute(
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(batch_height_pixels)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
diff --git a/mace/ops/opencl/image/fully_connected.h b/mace/ops/opencl/image/fully_connected.h
index 962ffaf0..d52e927f 100644
--- a/mace/ops/opencl/image/fully_connected.h
+++ b/mace/ops/opencl/image/fully_connected.h
@@ -64,7 +64,7 @@ MaceStatus FullyConnectedKernel<T>::Compute(
                               &output_image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/image_to_buffer.h b/mace/ops/opencl/image/image_to_buffer.h
index 6ca73fa6..f9c3b011 100644
--- a/mace/ops/opencl/image/image_to_buffer.h
+++ b/mace/ops/opencl/image/image_to_buffer.h
@@ -92,7 +92,7 @@ MaceStatus ImageToBuffer<T>::Compute(OpContext *context,
       break;
   }
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/lstm_cell.h b/mace/ops/opencl/image/lstm_cell.h
index 546b4a79..265f2e10 100644
--- a/mace/ops/opencl/image/lstm_cell.h
+++ b/mace/ops/opencl/image/lstm_cell.h
@@ -71,7 +71,7 @@ MaceStatus LSTMCellKernel<T>::Compute(
   const index_t hidden_units = pre_output->dim(1);
   const index_t w_blocks = hidden_units >> 2;
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/matmul.h b/mace/ops/opencl/image/matmul.h
index 763082f6..1681a8f8 100644
--- a/mace/ops/opencl/image/matmul.h
+++ b/mace/ops/opencl/image/matmul.h
@@ -82,7 +82,7 @@ MaceStatus MatMulKernel<T>::Compute(
       static_cast<uint32_t>(height_blocks * batch),
   };
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/pad.h b/mace/ops/opencl/image/pad.h
index cb0c390b..8d1cae3e 100644
--- a/mace/ops/opencl/image/pad.h
+++ b/mace/ops/opencl/image/pad.h
@@ -80,7 +80,7 @@ MaceStatus PadKernel<T>::Compute(
 
   const index_t channel_blocks = RoundUpDiv4(channels);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/pooling.h b/mace/ops/opencl/image/pooling.h
index f246efa4..1af67740 100644
--- a/mace/ops/opencl/image/pooling.h
+++ b/mace/ops/opencl/image/pooling.h
@@ -112,7 +112,7 @@ MaceStatus PoolingKernel<T>::Compute(
                               &output_image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/reduce_mean.h b/mace/ops/opencl/image/reduce_mean.h
index 95b51d86..3280691c 100644
--- a/mace/ops/opencl/image/reduce_mean.h
+++ b/mace/ops/opencl/image/reduce_mean.h
@@ -76,7 +76,7 @@ MaceStatus ReduceMeanKernel<T>::Compute(
                               &output_image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/resize_bicubic.h b/mace/ops/opencl/image/resize_bicubic.h
index bf5bfcf1..bf72ee78 100644
--- a/mace/ops/opencl/image/resize_bicubic.h
+++ b/mace/ops/opencl/image/resize_bicubic.h
@@ -102,7 +102,7 @@ MaceStatus ResizeBicubicKernel<T>::Compute(
                            static_cast<uint32_t>(out_width),
                            static_cast<uint32_t>(out_height * batch)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/resize_bilinear.h b/mace/ops/opencl/image/resize_bilinear.h
index b3f1b09c..1eb599c9 100644
--- a/mace/ops/opencl/image/resize_bilinear.h
+++ b/mace/ops/opencl/image/resize_bilinear.h
@@ -107,7 +107,7 @@ MaceStatus ResizeBilinearKernel<T>::Compute(
                            static_cast<uint32_t>(out_width),
                            static_cast<uint32_t>(out_height * batch)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/softmax.h b/mace/ops/opencl/image/softmax.h
index ffd5ec89..a19d9483 100644
--- a/mace/ops/opencl/image/softmax.h
+++ b/mace/ops/opencl/image/softmax.h
@@ -102,7 +102,7 @@ MaceStatus SoftmaxKernel<T>::Compute(
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/space_to_batch.h b/mace/ops/opencl/image/space_to_batch.h
index f2baaba4..c2190c68 100644
--- a/mace/ops/opencl/image/space_to_batch.h
+++ b/mace/ops/opencl/image/space_to_batch.h
@@ -66,7 +66,7 @@ MaceStatus SpaceToBatchKernel<T>::Compute(
       chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
       static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/space_to_depth.h b/mace/ops/opencl/image/space_to_depth.h
index e225b376..1df75ef8 100644
--- a/mace/ops/opencl/image/space_to_depth.h
+++ b/mace/ops/opencl/image/space_to_depth.h
@@ -79,7 +79,7 @@ MaceStatus SpaceToDepthKernel<T>::Compute(
                               &image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/split.h b/mace/ops/opencl/image/split.h
index 7b7f7905..d0427a4f 100644
--- a/mace/ops/opencl/image/split.h
+++ b/mace/ops/opencl/image/split.h
@@ -70,7 +70,7 @@ MaceStatus SplitKernel<T>::Compute(
         output_list[i]->ResizeImage(output_shape, image_shape));
   }
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/sqrdiff_mean.h b/mace/ops/opencl/image/sqrdiff_mean.h
index d0c217fe..ba84a5ef 100644
--- a/mace/ops/opencl/image/sqrdiff_mean.h
+++ b/mace/ops/opencl/image/sqrdiff_mean.h
@@ -72,7 +72,7 @@ MaceStatus SqrDiffMeanKernel<T>::Compute(
                               &output_image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/winograd_conv2d.cc b/mace/ops/opencl/image/winograd_conv2d.cc
index a9bd7171..8d684e59 100644
--- a/mace/ops/opencl/image/winograd_conv2d.cc
+++ b/mace/ops/opencl/image/winograd_conv2d.cc
@@ -37,7 +37,7 @@ MaceStatus WinogradInputTransform(OpContext *context,
                                   Tensor *output_tensor,
                                   uint32_t *kwg_size,
                                   StatsFuture *future) {
-  OpenCLRuntime *runtime = context->device()->opencl_runtime();
+  OpenCLRuntime *runtime = context->device()->gpu_runtime()->opencl_runtime();
   const index_t out_width = output_tensor->dim(2);
 
   MACE_OUT_OF_RANGE_DEFINITION;
@@ -119,7 +119,7 @@ MaceStatus WinogradOutputTransform(OpContext *context,
                                    Tensor *output_tensor,
                                    uint32_t *kwg_size,
                                    StatsFuture *future) {
-  OpenCLRuntime *runtime = context->device()->opencl_runtime();
+  OpenCLRuntime *runtime = context->device()->gpu_runtime()->opencl_runtime();
   auto &output_shape = output_tensor->shape();
 
   MACE_OUT_OF_RANGE_DEFINITION;
@@ -227,8 +227,9 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
                                        std::vector<index_t> *prev_input_shape,
                                        Tensor *output,
                                        uint32_t *kwg_size[3]) {
-  OpenCLRuntime *runtime = context->device()->opencl_runtime();
-  ScratchImageManager *scratch_manager = runtime->scratch_image_manager();
+  OpenCLRuntime *runtime = context->device()->gpu_runtime()->opencl_runtime();
+  ScratchImageManager *scratch_manager =
+      context->device()->gpu_runtime()->scratch_image_manager();
   StatsFuture t_input_future, mm_future, t_output_future;
   bool input_changed = !IsVecEqual(*prev_input_shape, input->shape());
   *prev_input_shape = input->shape();
diff --git a/mace/ops/opencl/out_of_range_check_test.cc b/mace/ops/opencl/out_of_range_check_test.cc
index eb223693..093e0fb4 100644
--- a/mace/ops/opencl/out_of_range_check_test.cc
+++ b/mace/ops/opencl/out_of_range_check_test.cc
@@ -35,7 +35,7 @@ MaceStatus BufferToImageOpImpl(OpContext *context,
   uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                      static_cast<uint32_t>(image_shape[1])};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
 
   std::string kernel_name = "in_out_buffer_to_image";
   std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc
index 6b08761e..5233ccde 100644
--- a/mace/ops/ops_test_util.cc
+++ b/mace/ops/ops_test_util.cc
@@ -206,7 +206,7 @@ MaceStatus OpsTestNet::RunOp(mace::DeviceType device) {
     auto opencl_mem_types = OpTestContext::Get()->opencl_mem_types();
     for (auto type : opencl_mem_types) {
       OpTestContext::Get()->GetDevice(device)
-          ->opencl_runtime()->set_mem_type(type);
+          ->gpu_runtime()->set_mem_type(type);
       Setup(device);
       MACE_RETURN_IF_ERROR(Run());
     }
@@ -242,8 +242,8 @@ MaceStatus OpsTestNet::RunNet(const mace::NetDef &net_def,
 void OpsTestNet::Sync() {
 #ifdef MACE_ENABLE_OPENCL
   if (net_ && device_type_ == DeviceType::GPU) {
-      OpTestContext::Get()->GetDevice(DeviceType::GPU)->opencl_runtime()
-          ->command_queue().finish();
+      OpTestContext::Get()->GetDevice(DeviceType::GPU)->gpu_runtime()
+      ->opencl_runtime()->command_queue().finish();
     }
 #endif
 }
diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc
index cb797906..aa18b7c1 100644
--- a/mace/ops/pad.cc
+++ b/mace/ops/pad.cc
@@ -97,7 +97,7 @@ class PadOp<DeviceType::GPU, T> : public Operation {
     std::vector<int> paddings = Operation::GetRepeatedArgs<int>("paddings");
     float constant_value = Operation::GetOptionalArg<float>(
         "constant_value", 0.0);
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::PadKernel<T>(paddings, constant_value));
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc
index b2aef666..50372c3c 100644
--- a/mace/ops/pooling.cc
+++ b/mace/ops/pooling.cc
@@ -429,7 +429,7 @@ class PoolingOp<DeviceType::GPU, T> : public PoolingOpBase {
  public:
   explicit PoolingOp(OpConstructContext *context)
       : PoolingOpBase(context) {
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::PoolingKernel<T>);
     } else {
       context->set_output_mem_type(MemoryType::GPU_BUFFER);
diff --git a/mace/ops/reduce_mean.cc b/mace/ops/reduce_mean.cc
index 20f7e81c..863103b2 100644
--- a/mace/ops/reduce_mean.cc
+++ b/mace/ops/reduce_mean.cc
@@ -246,7 +246,7 @@ class ReduceMeanOp<DeviceType::GPU, T> : public ReduceMeanOpBase {
  public:
   explicit ReduceMeanOp(OpConstructContext *context)
       : ReduceMeanOpBase(context) {
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::ReduceMeanKernel<T>(axis_, keep_dims_));
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/resize_bicubic.cc b/mace/ops/resize_bicubic.cc
index 40330060..3ccff3e6 100644
--- a/mace/ops/resize_bicubic.cc
+++ b/mace/ops/resize_bicubic.cc
@@ -195,7 +195,7 @@ class ResizeBicubicOp<DeviceType::GPU, T> : public Operation {
     std::vector<index_t> size = Operation::GetRepeatedArgs<index_t>(
         "size", {-1, -1});
     MACE_CHECK(size.size() == 2);
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::ResizeBicubicKernel<T>(align_corners,
                                                               size[0],
                                                               size[1]));
diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc
index 5ce6ef4a..748c2efd 100644
--- a/mace/ops/resize_bilinear.cc
+++ b/mace/ops/resize_bilinear.cc
@@ -331,7 +331,7 @@ class ResizeBilinearOp<DeviceType::GPU, T> : public Operation {
     std::vector<index_t> size = Operation::GetRepeatedArgs<index_t>(
         "size", {-1, -1});
     MACE_CHECK(size.size() == 2);
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::ResizeBilinearKernel<T>(align_corners,
                                                                size[0],
                                                                size[1]));
diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc
index 4a7505ae..2518b407 100644
--- a/mace/ops/softmax.cc
+++ b/mace/ops/softmax.cc
@@ -364,7 +364,7 @@ class SoftmaxOp<DeviceType::GPU, T> : public Operation {
  public:
   explicit SoftmaxOp(OpConstructContext *context)
       : Operation(context) {
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::SoftmaxKernel<T>);
     } else {
       context->set_output_mem_type(MemoryType::GPU_BUFFER);
diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc
index 7d422938..7fa9081d 100644
--- a/mace/ops/space_to_batch.cc
+++ b/mace/ops/space_to_batch.cc
@@ -308,7 +308,7 @@ class SpaceToBatchNDOp<DeviceType::GPU, T> : public SpaceToBatchOpBase {
  public:
   explicit SpaceToBatchNDOp(OpConstructContext *context)
       : SpaceToBatchOpBase(context) {
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::SpaceToBatchKernel<T>);
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc
index 11e5ade3..39e603ae 100644
--- a/mace/ops/space_to_depth.cc
+++ b/mace/ops/space_to_depth.cc
@@ -94,7 +94,7 @@ class SpaceToDepthOp<DeviceType::GPU, T> : public Operation {
   explicit SpaceToDepthOp(OpConstructContext *context)
       : Operation(context) {
     int block_size = Operation::GetOptionalArg<int>("block_size", 1);
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::SpaceToDepthKernel<T>(block_size));
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/split.cc b/mace/ops/split.cc
index 2e096631..0f9dcc04 100644
--- a/mace/ops/split.cc
+++ b/mace/ops/split.cc
@@ -105,7 +105,7 @@ class SplitOp<DeviceType::GPU, T> : public Operation {
   explicit SplitOp(OpConstructContext *context)
       : Operation(context) {
     int32_t axis = Operation::GetOptionalArg<int>("axis", 3);
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::SplitKernel<T>(axis));
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc
index 7927da3b..b469a3e3 100644
--- a/mace/ops/sqrdiff_mean.cc
+++ b/mace/ops/sqrdiff_mean.cc
@@ -82,7 +82,7 @@ class SqrDiffMeanOp<DeviceType::GPU, T> : public Operation {
  public:
   explicit SqrDiffMeanOp(OpConstructContext *context)
       : Operation(context) {
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::SqrDiffMeanKernel<T>());
     } else {
       MACE_NOT_IMPLEMENTED;
-- 
GitLab