diff --git a/mace/core/device.cc b/mace/core/device.cc
index 35e8c7af6bda7ba27faa768cedc0cbdbfecef7f7..4eb547c2f4f4e29d6066cce6da93b32f9ffceeb5 100644
--- a/mace/core/device.cc
+++ b/mace/core/device.cc
@@ -33,8 +33,8 @@ CPURuntime *CPUDevice::cpu_runtime() {
 }
 
 #ifdef MACE_ENABLE_OPENCL
-OpenCLRuntime *CPUDevice::opencl_runtime() {
-  LOG(FATAL) << "CPU device should not call OpenCL Runtime";
+GPURuntime *CPUDevice::gpu_runtime() {
+  LOG(FATAL) << "CPU device should not call GPU Runtime";
   return nullptr;
 }
 #endif
diff --git a/mace/core/device.h b/mace/core/device.h
index b7fe5f329b99401d31b04af102b2ca1d32d06bff..627d46bede29b6d888813a470a6c1f3a76145561 100644
--- a/mace/core/device.h
+++ b/mace/core/device.h
@@ -21,7 +21,7 @@
 #include "mace/core/allocator.h"
 
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/core/runtime/opencl/gpu_runtime.h"
 #endif
 
 namespace mace {
@@ -33,7 +33,7 @@ class Device {
   virtual ~Device() {}
 
 #ifdef MACE_ENABLE_OPENCL
-  virtual OpenCLRuntime *opencl_runtime() = 0;
+  virtual GPURuntime *gpu_runtime() = 0;
 #endif  // MACE_ENABLE_OPENCL
   virtual CPURuntime *cpu_runtime() = 0;
 
@@ -50,7 +50,7 @@ class CPUDevice : public Device {
   virtual ~CPUDevice();
 
 #ifdef MACE_ENABLE_OPENCL
-  OpenCLRuntime *opencl_runtime() override;
+  GPURuntime *gpu_runtime() override;
 #endif
   CPURuntime *cpu_runtime() override;
 
diff --git a/mace/core/runtime/opencl/gpu_device.cc b/mace/core/runtime/opencl/gpu_device.cc
index 09bb91816d0ff6aff45b68c85473d4a89b0ddc79..caea576773a8e226831a20ac5e4e5f7899e7ed24 100644
--- a/mace/core/runtime/opencl/gpu_device.cc
+++ b/mace/core/runtime/opencl/gpu_device.cc
@@ -30,12 +30,13 @@ GPUDevice::GPUDevice(std::shared_ptr<Tuner<uint32_t>> tuner,
     runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf,
                                opencl_binary_storage, tuner)),
     allocator_(new OpenCLAllocator(runtime_.get())),
-    scratch_buffer_(new ScratchBuffer(allocator_.get())) {}
+    scratch_buffer_(new ScratchBuffer(allocator_.get())),
+    gpu_runtime_(new GPURuntime(runtime_.get())) {}
 
 GPUDevice::~GPUDevice() = default;
 
-OpenCLRuntime* GPUDevice::opencl_runtime() {
-  return runtime_.get();
+GPURuntime* GPUDevice::gpu_runtime() {
+  return gpu_runtime_.get();
 }
 
 Allocator *GPUDevice::allocator() {
diff --git a/mace/core/runtime/opencl/gpu_device.h b/mace/core/runtime/opencl/gpu_device.h
index 1d36461b219ce5b28e4efa7ce6f769613eb92634..d3c7d98e88868ab02121a1151f360436894cecc9 100644
--- a/mace/core/runtime/opencl/gpu_device.h
+++ b/mace/core/runtime/opencl/gpu_device.h
@@ -19,6 +19,7 @@
 
 #include "mace/core/device_context.h"
 #include "mace/core/device.h"
+#include "mace/core/runtime/opencl/gpu_runtime.h"
 #include "mace/core/runtime/opencl/opencl_allocator.h"
 
 namespace mace {
@@ -34,7 +35,7 @@ class GPUDevice : public CPUDevice {
             CPUAffinityPolicy cpu_affinity_policy = AFFINITY_NONE,
             bool use_gemmlowp = false);
   ~GPUDevice();
-  OpenCLRuntime *opencl_runtime() override;
+  GPURuntime *gpu_runtime() override;
   Allocator *allocator() override;
   DeviceType device_type() const override;
   ScratchBuffer *scratch_buffer() override;
@@ -42,6 +43,7 @@ class GPUDevice : public CPUDevice {
   std::unique_ptr<OpenCLRuntime> runtime_;
   std::unique_ptr<OpenCLAllocator> allocator_;
   std::unique_ptr<ScratchBuffer> scratch_buffer_;
+  std::unique_ptr<GPURuntime> gpu_runtime_;
 };
 
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/gpu_runtime.cc b/mace/core/runtime/opencl/gpu_runtime.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8574ad48e4857eda4de415fdb17bba94a6bec7e1
--- /dev/null
+++ b/mace/core/runtime/opencl/gpu_runtime.cc
@@ -0,0 +1,45 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/runtime/opencl/gpu_runtime.h"
+
+#include "mace/core/runtime/opencl/scratch_image.h"
+
+namespace mace {
+
+// Stores `runtime` without taking ownership; image memory is the default.
+GPURuntime::GPURuntime(OpenCLRuntime *runtime)
+    : runtime_(runtime),
+      scratch_image_manager_(new ScratchImageManager),
+      mem_type_(MemoryType::GPU_IMAGE) {}
+
+GPURuntime::~GPURuntime() = default;
+
+// Hands back the OpenCL runtime pointer supplied at construction.
+OpenCLRuntime* GPURuntime::opencl_runtime() { return runtime_; }
+
+// Exposes the scratch image manager owned by this object.
+ScratchImageManager* GPURuntime::scratch_image_manager() const {
+  return scratch_image_manager_.get();
+}
+
+// True while the configured memory type is GPU_IMAGE.
+bool GPURuntime::UseImageMemory() {
+  return mem_type_ == MemoryType::GPU_IMAGE;
+}
+
+// Replaces the memory type consulted by UseImageMemory().
+void GPURuntime::set_mem_type(MemoryType type) { mem_type_ = type; }
+
+}  // namespace mace
diff --git a/mace/core/runtime/opencl/gpu_runtime.h b/mace/core/runtime/opencl/gpu_runtime.h
new file mode 100644
index 0000000000000000000000000000000000000000..fee776edb041c4b4dd2876f11c6bf46b4afe074c
--- /dev/null
+++ b/mace/core/runtime/opencl/gpu_runtime.h
@@ -0,0 +1,45 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_RUNTIME_OPENCL_GPU_RUNTIME_H_
+#define MACE_CORE_RUNTIME_OPENCL_GPU_RUNTIME_H_
+
+#include <memory>
+
+#include "mace/proto/mace.pb.h"
+
+namespace mace {
+
+class OpenCLRuntime;
+class ScratchImageManager;
+// GPU-side runtime state: OpenCL runtime handle, scratch images, memory type.
+class GPURuntime {
+ public:
+  explicit GPURuntime(OpenCLRuntime *runtime);  // Does not own `runtime`.
+  ~GPURuntime();
+  OpenCLRuntime *opencl_runtime();
+  ScratchImageManager *scratch_image_manager() const;
+
+  // TODO(liuqi): remove this function in the future, make decision at runtime.
+  bool UseImageMemory();
+  void set_mem_type(MemoryType type);
+
+ private:
+  OpenCLRuntime *runtime_;  // Not owned.
+  std::unique_ptr<ScratchImageManager> scratch_image_manager_;
+  MemoryType mem_type_;  // GPU_IMAGE unless changed through set_mem_type().
+};
+
+}  // namespace mace
+#endif  // MACE_CORE_RUNTIME_OPENCL_GPU_RUNTIME_H_
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index b552c65ab3f663e9e4db9add45f5e04913f0994c..904e74f6cd35d4b172808ba280ff2b53c54405ea 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -284,9 +284,7 @@ OpenCLRuntime::OpenCLRuntime(
     is_opencl_avaliable_(false),
     is_profiling_enabled_(false),
     opencl_version_(CL_VER_UNKNOWN),
-    gpu_type_(UNKNOWN),
-    mem_type_(MemoryType::GPU_IMAGE),
-    scratch_image_manager_(new ScratchImageManager) {
+    gpu_type_(UNKNOWN) {
   std::vector<cl::Platform> all_platforms;
   cl::Platform::get(&all_platforms);
   if (all_platforms.size() == 0) {
@@ -471,14 +469,6 @@ uint32_t OpenCLRuntime::device_compute_units() const {
   return device_compute_units_;
 }
 
-bool OpenCLRuntime::UseImageMemory() {
-  return this->mem_type_ == MemoryType::GPU_IMAGE;
-}
-
-void OpenCLRuntime::set_mem_type(MemoryType type) {
-  this->mem_type_ = type;
-}
-
 bool OpenCLRuntime::BuildProgramFromCache(
     const std::string &built_program_key,
     const std::string &build_options_str,
@@ -792,8 +782,4 @@ bool OpenCLRuntime::is_profiling_enabled() const {
   return is_profiling_enabled_;
 }
 
-ScratchImageManager* OpenCLRuntime::scratch_image_manager() const {
-  return scratch_image_manager_.get();
-}
-
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index 546b80086949215f82ba7fd831f96c509590a712..1e189b8eeb5f6e347d41680cc3977643757af22f 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -83,11 +83,7 @@ class OpenCLRuntime {
   uint64_t device_global_mem_cache_size() const;
   uint32_t device_compute_units() const;
   Tuner<uint32_t> *tuner();
-  ScratchImageManager *scratch_image_manager() const;
   bool is_opencl_avaliable();
-  // TODO(liuqi): remove this function in the future, make decision at runtime.
-  bool UseImageMemory();
-  void set_mem_type(MemoryType type);
 
   void GetCallStats(const cl::Event &event, CallStats *stats);
   uint64_t GetDeviceMaxWorkGroupSize();
@@ -135,8 +131,6 @@ class OpenCLRuntime {
   bool is_profiling_enabled_;
   OpenCLVersion opencl_version_;
   GPUType gpu_type_;
-  MemoryType mem_type_;
-  std::unique_ptr<ScratchImageManager> scratch_image_manager_;
   // All OpenCL object must be a pointer and manually deleted before unloading
   // OpenCL library.
   std::shared_ptr<cl::Context> context_;
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index bbef2c5d5a35331fbcecb5fc7b8197adeb3b2afa..5123e670bc900fb5e6f4be145f8fc64be5105b5d 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -109,7 +109,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
             (!is_quantize_model && HasQuantizedTensor(net_def))));
 #ifdef MACE_ENABLE_OPENCL
     diffused_buffer_ = diffused_buffer_ || (device_type == DeviceType::GPU &&
-        device->opencl_runtime()->GetDeviceMaxMemAllocSize() <=
+        device->gpu_runtime()->opencl_runtime()->GetDeviceMaxMemAllocSize() <=
             static_cast<uint64_t>(model_data_size));
 #endif
     if (diffused_buffer_) {
diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc
index 9244d62a8a9bfae3b7d93c901092ec928970b454..bcaff34da3372019fd1fc4f15b566a3c62402d93 100644
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -69,8 +69,8 @@ void UnloadModelData(const unsigned char *model_data,
 #ifdef MACE_ENABLE_OPENCL
 MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
   // Check OpenCL avaliable
-  auto runtime = device->opencl_runtime();
-  if (!runtime->is_opencl_avaliable()) {
+  auto runtime = device->gpu_runtime();
+  if (!runtime->opencl_runtime()->is_opencl_avaliable()) {
     LOG(WARNING) << "The device does not support OpenCL";
     return MaceStatus::MACE_OUT_OF_RESOURCES;
   }
@@ -678,8 +678,8 @@ MaceStatus MaceEngine::Impl::Run(
 
 #ifdef MACE_ENABLE_OPENCL
   if (device_type_ == GPU) {
-    device_->opencl_runtime()->command_queue().finish();
-    device_->opencl_runtime()->SaveBuiltCLProgram();
+    device_->gpu_runtime()->opencl_runtime()->command_queue().finish();
+    device_->gpu_runtime()->opencl_runtime()->SaveBuiltCLProgram();
   }
 #endif
   for (auto &output : *outputs) {
diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc
index b904b5c275373e48f59358b8a238f61dd6917bf6..fe8862bb85bfe472ef558411632525ab8e6fd7ef 100644
--- a/mace/ops/activation.cc
+++ b/mace/ops/activation.cc
@@ -81,7 +81,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
     auto relux_max_limit = static_cast<T>(
         Operation::GetOptionalArg<float>("max_limit", 0.0f));
     MemoryType mem_type;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(
           new opencl::image::ActivationKernel<T>(type, relux_max_limit));
diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc
index 4040de1fa50eacea06c654b47c0515918b505d61..0fe0c7b4371839f6a9c4449b633925d834f91efe 100644
--- a/mace/ops/addn.cc
+++ b/mace/ops/addn.cc
@@ -106,7 +106,7 @@ class AddNOp<DeviceType::GPU, T> : public Operation {
  public:
   explicit AddNOp(OpConstructContext *context)
       : Operation(context) {
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::AddNKernel<T>);
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc
index 1758f79b799a11df6b075222ffb022be5a71b615..3ca5592a8bebb8126809b50edff8a9ba1a6ef430 100644
--- a/mace/ops/batch_norm.cc
+++ b/mace/ops/batch_norm.cc
@@ -149,7 +149,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
         Operation::GetOptionalArg<std::string>("activation", "NOOP"));
     float relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
     MemoryType mem_type;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::BatchNormKernel<T>(
           epsilon, activation, relux_max_limit));
diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc
index 5cc6a1e025c54b755a61d3e0c5331d0f38aa5450..3aa5acecbd82755dcdfb5aa007e076e3cb950e84 100644
--- a/mace/ops/batch_to_space.cc
+++ b/mace/ops/batch_to_space.cc
@@ -265,7 +265,7 @@ class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
  public:
   explicit BatchToSpaceNDOp(OpConstructContext *context)
       : BatchToSpaceOpBase(context) {
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::BatchToSpaceKernel<T>);
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc
index 59579fa518bd613700251ee74b2265025337d58d..9190cf95859ea8dd4a0b27d93e70aad8cf47825f 100644
--- a/mace/ops/bias_add.cc
+++ b/mace/ops/bias_add.cc
@@ -101,7 +101,7 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
         data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
             "data_format", NHWC))) {
     MemoryType mem_type;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::BiasAddKernel<T>);
     } else {
diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc
index 04c6a88dc99c06ac9f401a1839205d349b32ff90..d4404c618d0a06c75892782fab7bcd48866e5ebc 100644
--- a/mace/ops/channel_shuffle.cc
+++ b/mace/ops/channel_shuffle.cc
@@ -84,7 +84,7 @@ class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
   explicit ChannelShuffleOp(OpConstructContext *context)
       : Operation(context) {
     const int groups = Operation::GetOptionalArg<int>("group", 1);
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::ChannelShuffleKernel<T>(groups));
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc
index eec11e0bb132055238d0dee95091d088729799bc..3fa5ef2c5097e9c2a38f68fac1707a46bb440777 100644
--- a/mace/ops/concat.cc
+++ b/mace/ops/concat.cc
@@ -196,7 +196,7 @@ class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
  public:
   explicit ConcatOp(OpConstructContext *context)
       : ConcatOpBase(context) {
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::ConcatKernel<T>(axis_));
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc
index a5cbec7411aaa47f82717e50a71ee1cf3d4d87e6..0a0d3bb51b7e412db06712e401e5268c53bf10b7 100644
--- a/mace/ops/conv_2d.cc
+++ b/mace/ops/conv_2d.cc
@@ -963,7 +963,7 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
         relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
         wino_block_size_(Operation::GetOptionalArg<int>("wino_block_size", 0)) {
     MemoryType mem_type;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::Conv2dKernel<T>);
     } else {
@@ -974,7 +974,7 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
     // Transform filter tensor to target format
     if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
         (kernel_->CheckUseWinograd(
-          context->device()->opencl_runtime(),
+          context->device()->gpu_runtime()->opencl_runtime(),
           context->workspace()->GetTensor(
               operator_def_->input(1))->shape(),
           std::vector<index_t>(operator_def_->output_shape(0).dims().begin(),
diff --git a/mace/ops/crop.cc b/mace/ops/crop.cc
index b056f21c189f862da14481bec3111edf5af8687c..7b705069f14b76fb785907116939144cf9897d18 100644
--- a/mace/ops/crop.cc
+++ b/mace/ops/crop.cc
@@ -113,7 +113,7 @@ class CropOp<DeviceType::GPU, T> : public Operation {
   explicit CropOp(OpConstructContext *context)
       : Operation(context) {
     const int axis = Operation::GetOptionalArg<int>("axis", 2);
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::CropKernel<T>(
           axis, Operation::GetRepeatedArgs<int>("offset")));
     } else {
diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc
index 5697c8413544742ad1517154c84511f9031cbabb..575e81addca1333b7481b604d3aaff9ef660719b 100644
--- a/mace/ops/deconv_2d.cc
+++ b/mace/ops/deconv_2d.cc
@@ -360,7 +360,7 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
   explicit Deconv2dOp(OpConstructContext *context)
       : Deconv2dOpBase(context) {
     MemoryType mem_type = MemoryType::GPU_IMAGE;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::Deconv2dKernel<T>);
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc
index e18cc106f4fba10c4f054cd7d8c219b0ef032118..ee06075a9766bf362051cd202dce75c0014ca5a5 100644
--- a/mace/ops/depth_to_space.cc
+++ b/mace/ops/depth_to_space.cc
@@ -96,7 +96,7 @@ class DepthToSpaceOp<DeviceType::GPU, T> : public Operation {
   explicit DepthToSpaceOp(OpConstructContext *context)
       : Operation(context) {
     int block_size = Operation::GetOptionalArg<int>("block_size", 1);
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::DepthToSpaceKernel<T>(block_size));
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc
index 8a85ab464ca0911b95a3ea4f039e1c61eb60da17..2f849ef7f1252087673427be30f24c40da60c58b 100644
--- a/mace/ops/depthwise_conv2d.cc
+++ b/mace/ops/depthwise_conv2d.cc
@@ -492,7 +492,7 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
   explicit DepthwiseConv2dOp(OpConstructContext *context)
       : DepthwiseConv2dOpBase(context) {
     MemoryType mem_type;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::DepthwiseConv2dKernel<T>);
     } else {
diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc
index 3f10a514cec8712b583b1f0fcae2166fe747da46..a4e7148e6159bc5129f84b8dc68d9aa45b46e3fa 100644
--- a/mace/ops/depthwise_deconv2d.cc
+++ b/mace/ops/depthwise_deconv2d.cc
@@ -410,7 +410,7 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
   explicit DepthwiseDeconv2dOp(OpConstructContext *context)
       : Deconv2dOpBase(context) {
     MemoryType mem_type = MemoryType::GPU_IMAGE;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::DepthwiseDeconv2dKernel<T>);
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc
index 863b69edc2033e54866f5935b097d4f93c968395..1a2e09081fd6f3fd302aad96d113417a7d65bcba 100644
--- a/mace/ops/eltwise.cc
+++ b/mace/ops/eltwise.cc
@@ -1088,7 +1088,7 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
     int32_t scalar_input_index = Operation::GetOptionalArg<int32_t>(
             "scalar_input_index", 1);
     MemoryType mem_type;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::EltwiseKernel<T>(
           type, coeff, scalar_input, scalar_input_index));
diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc
index ef919d9292bab8b2474a40ab30053b587bd79d96..31b1fb058dcae66f9a64c8cd04d8a7cb5dcdd2a1 100644
--- a/mace/ops/fully_connected.cc
+++ b/mace/ops/fully_connected.cc
@@ -194,7 +194,7 @@ class FullyConnectedOp<DeviceType::GPU, T> : public FullyConnectedOpBase {
   explicit FullyConnectedOp(OpConstructContext *context)
       : FullyConnectedOpBase(context) {
     MemoryType mem_type;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::FullyConnectedKernel<T>);
     } else {
diff --git a/mace/ops/lstm_cell.cc b/mace/ops/lstm_cell.cc
index dfbfa155a31377dbbbd20cbd7d6c6ebe5df48838..bc34b969bb9b7bebf17f10552cc9a55751fdaae2 100644
--- a/mace/ops/lstm_cell.cc
+++ b/mace/ops/lstm_cell.cc
@@ -34,7 +34,7 @@ class LSTMCellOp<DeviceType::GPU, T> : public Operation {
         Operation::GetOptionalArg<float>("scalar_input",
                                          0.0));
     MemoryType mem_type = MemoryType::GPU_IMAGE;
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::LSTMCellKernel<T>(forget_bias));
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/opencl/buffer/buffer_transform.cc b/mace/ops/opencl/buffer/buffer_transform.cc
index 9ba3f81d1e7b59bd1c7b0b015616da1cec775ac7..5bfc53899e321964c68d812290ccfee3a9ff9b3f 100644
--- a/mace/ops/opencl/buffer/buffer_transform.cc
+++ b/mace/ops/opencl/buffer/buffer_transform.cc
@@ -47,7 +47,7 @@ MaceStatus TransformConv2DFilter(
   MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
   output->Reshape(input->shape());
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
@@ -116,7 +116,7 @@ MaceStatus TransformDWConv2DFilter(
   MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
   output->Reshape(input->shape());
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
@@ -173,7 +173,7 @@ MaceStatus TransformArgument(
   MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
   output->Reshape(input->shape());
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
diff --git a/mace/ops/opencl/buffer/buffer_type_transform.cc b/mace/ops/opencl/buffer/buffer_type_transform.cc
index ce405e9f3da2865c4a2547389f15cdb9434f6996..757283792f016cf58ec2beb711a107ad11c0172c 100644
--- a/mace/ops/opencl/buffer/buffer_type_transform.cc
+++ b/mace/ops/opencl/buffer/buffer_type_transform.cc
@@ -31,7 +31,7 @@ MaceStatus BufferTypeTransform(
     Tensor *output) {
   MACE_RETURN_IF_ERROR(output->ResizeLike(input));
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION
 
   const uint32_t gws =
diff --git a/mace/ops/opencl/buffer/conv_2d_1x1.cc b/mace/ops/opencl/buffer/conv_2d_1x1.cc
index 62e77b17b3fe8b0d80d0d5b8665c17b0fa8ca728..abe7d93be2a24a513f231cfdd36c71107004d33b 100644
--- a/mace/ops/opencl/buffer/conv_2d_1x1.cc
+++ b/mace/ops/opencl/buffer/conv_2d_1x1.cc
@@ -43,7 +43,7 @@ MaceStatus Conv2d1x1(OpContext *context,
   const index_t in_height = padded_input->dim(1);
   const index_t in_width = padded_input->dim(2);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/buffer/conv_2d_general.cc b/mace/ops/opencl/buffer/conv_2d_general.cc
index f9cc804d7ca04529eabe16aea9d4f8a453289640..e8ac509ccfe957717c6e206cdff1837211595326 100644
--- a/mace/ops/opencl/buffer/conv_2d_general.cc
+++ b/mace/ops/opencl/buffer/conv_2d_general.cc
@@ -48,7 +48,7 @@ MaceStatus Conv2dGeneral(OpContext *context,
   const index_t filter_height = filter->dim(2);
   const index_t filter_width = filter->dim(3);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/buffer/depthwise_conv2d.cc b/mace/ops/opencl/buffer/depthwise_conv2d.cc
index 0ba4526c0199c3262e95a0aace503f0977157e97..d2c335999c6680f8b4ee2c01a28d4c1ca049f87e 100644
--- a/mace/ops/opencl/buffer/depthwise_conv2d.cc
+++ b/mace/ops/opencl/buffer/depthwise_conv2d.cc
@@ -48,7 +48,7 @@ MaceStatus DepthwiseConv2d(OpContext *context,
   const index_t filter_height = filter->dim(2);
   const index_t filter_width = filter->dim(3);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/buffer/pooling.h b/mace/ops/opencl/buffer/pooling.h
index 4684d687874fdc37d0227b222201a59e29425e79..de7d76108fd40e65cb745aa4172adcc993cc6302 100644
--- a/mace/ops/opencl/buffer/pooling.h
+++ b/mace/ops/opencl/buffer/pooling.h
@@ -92,7 +92,7 @@ MaceStatus PoolingKernel<T>::Compute(
   bool input_changed = !IsVecEqual(input_shape_, input->shape());
   input_shape_ = input->shape();
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
 
   // pad input
   std::vector<index_t> padded_input_shape = input->shape();
diff --git a/mace/ops/opencl/buffer/softmax.h b/mace/ops/opencl/buffer/softmax.h
index 3147a935b8116fbbb1daa1e6cb5433df552087f8..248ee0c85c1ddf3c45a52e74c966c87a372528d7 100644
--- a/mace/ops/opencl/buffer/softmax.h
+++ b/mace/ops/opencl/buffer/softmax.h
@@ -75,7 +75,7 @@ MaceStatus SoftmaxKernel<T>::Compute(
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/buffer/utils.cc b/mace/ops/opencl/buffer/utils.cc
index b4214a0af02f4967374363f88fe54854e80055a8..141a96b748bc5430aff17396bc0661f737a6df40 100644
--- a/mace/ops/opencl/buffer/utils.cc
+++ b/mace/ops/opencl/buffer/utils.cc
@@ -47,7 +47,7 @@ MaceStatus PadInput(OpContext *context,
       static_cast<uint32_t>(padded_height * batch)
   };
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/image/activation.h b/mace/ops/opencl/image/activation.h
index 93944b5b7810ef6048b623f898fc6c8f69609272..80713c36977b495ae857f0af75c031a424c933ea 100644
--- a/mace/ops/opencl/image/activation.h
+++ b/mace/ops/opencl/image/activation.h
@@ -66,7 +66,7 @@ MaceStatus ActivationKernel<T>::Compute(
 
   const index_t channel_blocks = RoundUpDiv4(channels);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/addn.h b/mace/ops/opencl/image/addn.h
index 7692ac06b8e281295381b7ecf77d446784988859..48f6d8f840dc2457ec886842b824c136d1616b1a 100644
--- a/mace/ops/opencl/image/addn.h
+++ b/mace/ops/opencl/image/addn.h
@@ -57,7 +57,7 @@ MaceStatus AddNKernel<T>::Compute(
   const index_t width = input_tensors[0]->dim(2);
   const index_t channels = input_tensors[0]->dim(3);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   for (size_t i = 1; i < size; ++i) {
diff --git a/mace/ops/opencl/image/batch_norm.h b/mace/ops/opencl/image/batch_norm.h
index 5685c5145814f0428b3fade69a4672349cd19250..689088300e61ab3561eac831147ac0c4d61c9bf2 100644
--- a/mace/ops/opencl/image/batch_norm.h
+++ b/mace/ops/opencl/image/batch_norm.h
@@ -85,7 +85,7 @@ MaceStatus BatchNormKernel<T>::Compute(
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/batch_to_space.h b/mace/ops/opencl/image/batch_to_space.h
index 9d91802627c840538b70d5a4f994d3ca572e8504..35281c7072e96fe05257f143b6624058e764bdc9 100644
--- a/mace/ops/opencl/image/batch_to_space.h
+++ b/mace/ops/opencl/image/batch_to_space.h
@@ -68,7 +68,7 @@ MaceStatus BatchToSpaceKernel<T>::Compute(
       chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
       static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/bias_add.h b/mace/ops/opencl/image/bias_add.h
index 25e2392edc055af6d5630b371e665160eb18b147..a37ee2b18b9c1c1344e21bdf5096d5e18b304fea 100644
--- a/mace/ops/opencl/image/bias_add.h
+++ b/mace/ops/opencl/image/bias_add.h
@@ -62,7 +62,7 @@ MaceStatus BiasAddKernel<T>::Compute(
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/buffer_to_image.h b/mace/ops/opencl/image/buffer_to_image.h
index 14a0ae4b3e474eb464580701446346248f5d1982..6ff3284ea69d8ef1be1d7e9f6c62d62ca8fa8527 100644
--- a/mace/ops/opencl/image/buffer_to_image.h
+++ b/mace/ops/opencl/image/buffer_to_image.h
@@ -98,7 +98,7 @@ MaceStatus BufferToImage<T>::Compute(
     }
   }
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/channel_shuffle.h b/mace/ops/opencl/image/channel_shuffle.h
index 53acbf15cdef206bb43b8dac9eb2a7d1b7c1b1ce..f890c0c3309988cad9acc380560c3358f736e775 100644
--- a/mace/ops/opencl/image/channel_shuffle.h
+++ b/mace/ops/opencl/image/channel_shuffle.h
@@ -70,7 +70,7 @@ MaceStatus ChannelShuffleKernel<T>::Compute(
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
 
   MACE_OUT_OF_RANGE_DEFINITION;
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/concat.cc b/mace/ops/opencl/image/concat.cc
index aab72c5445709049e78543c1b4246f0eec6f2724..5dfe666eb42fa63be298355f3e0428cfb3f05235 100644
--- a/mace/ops/opencl/image/concat.cc
+++ b/mace/ops/opencl/image/concat.cc
@@ -65,7 +65,7 @@ MaceStatus Concat2(OpContext *context,
       static_cast<uint32_t>(batch * height),
   };
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
@@ -126,7 +126,7 @@ MaceStatus ConcatN(OpContext *context,
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/image/conv_2d_1x1.cc b/mace/ops/opencl/image/conv_2d_1x1.cc
index f88882ee645814f81d13bef5cd80ef9ebcb5092f..57be075076ffa2cf077049b076475db1d1c67454 100644
--- a/mace/ops/opencl/image/conv_2d_1x1.cc
+++ b/mace/ops/opencl/image/conv_2d_1x1.cc
@@ -95,7 +95,7 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
   const index_t width_blocks = RoundUpDiv4(width);
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/image/conv_2d_3x3.cc b/mace/ops/opencl/image/conv_2d_3x3.cc
index 3e5aee909c89bbed8e94488c5d38d8be3f93615d..f7905a0c02bc14346cef2cca990d23f2a67d30c1 100644
--- a/mace/ops/opencl/image/conv_2d_3x3.cc
+++ b/mace/ops/opencl/image/conv_2d_3x3.cc
@@ -83,7 +83,7 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
   const index_t width_blocks = RoundUpDiv<index_t, 5>(width);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/image/conv_2d_general.cc b/mace/ops/opencl/image/conv_2d_general.cc
index 120a3daa3067d91118c101e8b95798f7bde84a1d..28bdea6c7f7ffa41d07ccf734c9029251d32cf82 100644
--- a/mace/ops/opencl/image/conv_2d_general.cc
+++ b/mace/ops/opencl/image/conv_2d_general.cc
@@ -91,7 +91,7 @@ extern MaceStatus Conv2d(OpContext *context,
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
   const index_t width_blocks = RoundUpDiv4(width);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/image/crop.h b/mace/ops/opencl/image/crop.h
index c8f98a4ca7a2f2cdf8ba96135444e31e25ed1867..a83349c49ebe16c9ade1356b73a751bfca100d26 100644
--- a/mace/ops/opencl/image/crop.h
+++ b/mace/ops/opencl/image/crop.h
@@ -141,7 +141,7 @@ MaceStatus CropKernel<T>::Compute(
       static_cast<uint32_t>(output->dim(0) * output->dim(1))
   };
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/deconv_2d.h b/mace/ops/opencl/image/deconv_2d.h
index f3d6cbe92049380634540ae94419b96a2a1444e1..a8dd9c26c485b38d6852c06c79f9c11b962d3b77 100644
--- a/mace/ops/opencl/image/deconv_2d.h
+++ b/mace/ops/opencl/image/deconv_2d.h
@@ -92,7 +92,7 @@ MaceStatus Deconv2dKernel<T>::Compute(
   const int align_w = stride_w - 1 - padding_w;
   const int kernel_size = filter->dim(2) * filter->dim(3);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/depth_to_space.h b/mace/ops/opencl/image/depth_to_space.h
index 77c4bd53dfc661fd23381d9e8ebac3cf33c15017..1783b81316c13fc7d4eec5aa9004c488e6ab707b 100644
--- a/mace/ops/opencl/image/depth_to_space.h
+++ b/mace/ops/opencl/image/depth_to_space.h
@@ -87,7 +87,7 @@ MaceStatus DepthToSpaceKernel<T>::Compute(
       static_cast<uint32_t>(output_width),
       static_cast<uint32_t>(output_height * batch)
   };
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/depthwise_conv2d.cc b/mace/ops/opencl/image/depthwise_conv2d.cc
index 02409ebeda304dabc78f98c45688b9c4ce4a64de..57a4415e0ff3726dc20ce92024d16cdb48504e5b 100644
--- a/mace/ops/opencl/image/depthwise_conv2d.cc
+++ b/mace/ops/opencl/image/depthwise_conv2d.cc
@@ -93,7 +93,7 @@ MaceStatus DepthwiseConv2d(OpContext *context,
                            static_cast<uint32_t>(width_blocks),
                            static_cast<uint32_t>(height * batch)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel->get() == nullptr) {
diff --git a/mace/ops/opencl/image/depthwise_deconv2d.h b/mace/ops/opencl/image/depthwise_deconv2d.h
index 96fdfa51e110395f3028003f3058a029765519f5..d07a164955dcd4e2c54efac3f9fe9b9039d01f90 100644
--- a/mace/ops/opencl/image/depthwise_deconv2d.h
+++ b/mace/ops/opencl/image/depthwise_deconv2d.h
@@ -98,7 +98,7 @@ MaceStatus DepthwiseDeconv2dKernel<T>::Compute(
   const int align_w = stride_w - 1 - padding_w;
   const int kernel_size = filter->dim(2) * filter->dim(3);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/eltwise.h b/mace/ops/opencl/image/eltwise.h
index 2afb334233731307582d83ea77d2ec1ad77ce661..9600d501fdb579e3b0e0075c8f13ad28a7ce8705 100644
--- a/mace/ops/opencl/image/eltwise.h
+++ b/mace/ops/opencl/image/eltwise.h
@@ -117,7 +117,7 @@ MaceStatus EltwiseKernel<T>::Compute(
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(batch_height_pixels)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
diff --git a/mace/ops/opencl/image/fully_connected.h b/mace/ops/opencl/image/fully_connected.h
index 962ffaf082ca93e1f6129fa2f5d123c0e3454603..d52e927fffd8b89c84cb0952864b6f53addc1b62 100644
--- a/mace/ops/opencl/image/fully_connected.h
+++ b/mace/ops/opencl/image/fully_connected.h
@@ -64,7 +64,7 @@ MaceStatus FullyConnectedKernel<T>::Compute(
                               &output_image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/image_to_buffer.h b/mace/ops/opencl/image/image_to_buffer.h
index 6ca73fa6af9b8a39c43d6586d9167ca8655d6ffa..f9c3b011d120d697e856fe4b997e73eed63d607c 100644
--- a/mace/ops/opencl/image/image_to_buffer.h
+++ b/mace/ops/opencl/image/image_to_buffer.h
@@ -92,7 +92,7 @@ MaceStatus ImageToBuffer<T>::Compute(OpContext *context,
       break;
   }
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/lstm_cell.h b/mace/ops/opencl/image/lstm_cell.h
index 546b4a792de1c892a3fd9d6c0e11f255b9cb7501..265f2e10f9f536db9a692bf15d966153c05949e6 100644
--- a/mace/ops/opencl/image/lstm_cell.h
+++ b/mace/ops/opencl/image/lstm_cell.h
@@ -71,7 +71,7 @@ MaceStatus LSTMCellKernel<T>::Compute(
   const index_t hidden_units = pre_output->dim(1);
   const index_t w_blocks = hidden_units >> 2;
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/matmul.h b/mace/ops/opencl/image/matmul.h
index 763082f610f5b4a115a76fc55be08c459a278d14..1681a8f8d98843c54b62aec24637924a80be9e1d 100644
--- a/mace/ops/opencl/image/matmul.h
+++ b/mace/ops/opencl/image/matmul.h
@@ -82,7 +82,7 @@ MaceStatus MatMulKernel<T>::Compute(
       static_cast<uint32_t>(height_blocks * batch),
   };
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/pad.h b/mace/ops/opencl/image/pad.h
index cb0c390b667a46329ab4f9728caeea10f1eea0c7..8d1cae3e6fe3bd2830b9742c6abb0e9b4c0371df 100644
--- a/mace/ops/opencl/image/pad.h
+++ b/mace/ops/opencl/image/pad.h
@@ -80,7 +80,7 @@ MaceStatus PadKernel<T>::Compute(
 
   const index_t channel_blocks = RoundUpDiv4(channels);
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/pooling.h b/mace/ops/opencl/image/pooling.h
index f246efa426618e9c197f30d253e23338bd11f73d..1af677403bfa3160aedf8266bc24cf45baf04b37 100644
--- a/mace/ops/opencl/image/pooling.h
+++ b/mace/ops/opencl/image/pooling.h
@@ -112,7 +112,7 @@ MaceStatus PoolingKernel<T>::Compute(
                               &output_image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/reduce_mean.h b/mace/ops/opencl/image/reduce_mean.h
index 95b51d86f883338fd0e4e57952edfd5965f85a61..3280691c9d303d08048bcb23bce2ab040c72b9e7 100644
--- a/mace/ops/opencl/image/reduce_mean.h
+++ b/mace/ops/opencl/image/reduce_mean.h
@@ -76,7 +76,7 @@ MaceStatus ReduceMeanKernel<T>::Compute(
                               &output_image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/resize_bicubic.h b/mace/ops/opencl/image/resize_bicubic.h
index bf5bfcf1921254c3939f77a5f3dc7711ea780289..bf72ee78c9edf67a6de31a0e2c9bae9f3ab35ceb 100644
--- a/mace/ops/opencl/image/resize_bicubic.h
+++ b/mace/ops/opencl/image/resize_bicubic.h
@@ -102,7 +102,7 @@ MaceStatus ResizeBicubicKernel<T>::Compute(
                            static_cast<uint32_t>(out_width),
                            static_cast<uint32_t>(out_height * batch)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/resize_bilinear.h b/mace/ops/opencl/image/resize_bilinear.h
index b3f1b09c6ee08f356f328e9e729c573abd5bb4e4..1eb599c98f27b7adad019540154c1b8bb5c59296 100644
--- a/mace/ops/opencl/image/resize_bilinear.h
+++ b/mace/ops/opencl/image/resize_bilinear.h
@@ -107,7 +107,7 @@ MaceStatus ResizeBilinearKernel<T>::Compute(
                            static_cast<uint32_t>(out_width),
                            static_cast<uint32_t>(out_height * batch)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/softmax.h b/mace/ops/opencl/image/softmax.h
index ffd5ec89a60e90aa57a4192d30022b7cd7586d8d..a19d9483719fee79fdcb3aad9a15191ffb7441a1 100644
--- a/mace/ops/opencl/image/softmax.h
+++ b/mace/ops/opencl/image/softmax.h
@@ -102,7 +102,7 @@ MaceStatus SoftmaxKernel<T>::Compute(
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/space_to_batch.h b/mace/ops/opencl/image/space_to_batch.h
index f2baaba48259da64f2f8ed18620da37edd154245..c2190c681d59d7fbf72cc5d4fa821a71a914796d 100644
--- a/mace/ops/opencl/image/space_to_batch.h
+++ b/mace/ops/opencl/image/space_to_batch.h
@@ -66,7 +66,7 @@ MaceStatus SpaceToBatchKernel<T>::Compute(
       chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
       static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/space_to_depth.h b/mace/ops/opencl/image/space_to_depth.h
index e225b37693377acf57f2d91b17cc3269bc8a20a3..1df75ef831f563835317d51241456c2941c55af4 100644
--- a/mace/ops/opencl/image/space_to_depth.h
+++ b/mace/ops/opencl/image/space_to_depth.h
@@ -79,7 +79,7 @@ MaceStatus SpaceToDepthKernel<T>::Compute(
                               &image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/split.h b/mace/ops/opencl/image/split.h
index 7b7f790597f4daba916a0ab2cc1d103fdf11df26..d0427a4f16ce5b18d37c09ce274e9d1fd621661e 100644
--- a/mace/ops/opencl/image/split.h
+++ b/mace/ops/opencl/image/split.h
@@ -70,7 +70,7 @@ MaceStatus SplitKernel<T>::Compute(
         output_list[i]->ResizeImage(output_shape, image_shape));
   }
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/sqrdiff_mean.h b/mace/ops/opencl/image/sqrdiff_mean.h
index d0c217fe450018d038e2d617fe4bdf5e6c4ba5de..ba84a5ef04fabb85f2943db96fe9a044f796d9a1 100644
--- a/mace/ops/opencl/image/sqrdiff_mean.h
+++ b/mace/ops/opencl/image/sqrdiff_mean.h
@@ -72,7 +72,7 @@ MaceStatus SqrDiffMeanKernel<T>::Compute(
                               &output_image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
   MACE_OUT_OF_RANGE_DEFINITION;
 
   if (kernel_.get() == nullptr) {
diff --git a/mace/ops/opencl/image/winograd_conv2d.cc b/mace/ops/opencl/image/winograd_conv2d.cc
index a9bd717155b7cfed5f5a6cac32a64d57fad63545..8d684e59f90d3ced55f632414ad8890b7764452b 100644
--- a/mace/ops/opencl/image/winograd_conv2d.cc
+++ b/mace/ops/opencl/image/winograd_conv2d.cc
@@ -37,7 +37,7 @@ MaceStatus WinogradInputTransform(OpContext *context,
                                   Tensor *output_tensor,
                                   uint32_t *kwg_size,
                                   StatsFuture *future) {
-  OpenCLRuntime *runtime = context->device()->opencl_runtime();
+  OpenCLRuntime *runtime = context->device()->gpu_runtime()->opencl_runtime();
   const index_t out_width = output_tensor->dim(2);
 
   MACE_OUT_OF_RANGE_DEFINITION;
@@ -119,7 +119,7 @@ MaceStatus WinogradOutputTransform(OpContext *context,
                                    Tensor *output_tensor,
                                    uint32_t *kwg_size,
                                    StatsFuture *future) {
-  OpenCLRuntime *runtime = context->device()->opencl_runtime();
+  OpenCLRuntime *runtime = context->device()->gpu_runtime()->opencl_runtime();
   auto &output_shape = output_tensor->shape();
 
   MACE_OUT_OF_RANGE_DEFINITION;
@@ -227,8 +227,9 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
                                        std::vector<index_t> *prev_input_shape,
                                        Tensor *output,
                                        uint32_t *kwg_size[3]) {
-  OpenCLRuntime *runtime = context->device()->opencl_runtime();
-  ScratchImageManager *scratch_manager = runtime->scratch_image_manager();
+  OpenCLRuntime *runtime = context->device()->gpu_runtime()->opencl_runtime();
+  ScratchImageManager *scratch_manager =
+      context->device()->gpu_runtime()->scratch_image_manager();
   StatsFuture t_input_future, mm_future, t_output_future;
   bool input_changed = !IsVecEqual(*prev_input_shape, input->shape());
   *prev_input_shape = input->shape();
diff --git a/mace/ops/opencl/out_of_range_check_test.cc b/mace/ops/opencl/out_of_range_check_test.cc
index eb2236931b08561715ef08e3e3194084261004d8..093e0fb47e0440cdaae8531ae2875bf5c1295763 100644
--- a/mace/ops/opencl/out_of_range_check_test.cc
+++ b/mace/ops/opencl/out_of_range_check_test.cc
@@ -35,7 +35,7 @@ MaceStatus BufferToImageOpImpl(OpContext *context,
   uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                      static_cast<uint32_t>(image_shape[1])};
 
-  auto runtime = context->device()->opencl_runtime();
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
 
   std::string kernel_name = "in_out_buffer_to_image";
   std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc
index 6b08761e34eec22992db490c21740865bdfe3660..5233ccde1e6a7043f864e558045b823bd63c9507 100644
--- a/mace/ops/ops_test_util.cc
+++ b/mace/ops/ops_test_util.cc
@@ -206,7 +206,7 @@ MaceStatus OpsTestNet::RunOp(mace::DeviceType device) {
     auto opencl_mem_types = OpTestContext::Get()->opencl_mem_types();
     for (auto type : opencl_mem_types) {
       OpTestContext::Get()->GetDevice(device)
-          ->opencl_runtime()->set_mem_type(type);
+          ->gpu_runtime()->set_mem_type(type);
       Setup(device);
       MACE_RETURN_IF_ERROR(Run());
     }
@@ -242,8 +242,8 @@ MaceStatus OpsTestNet::RunNet(const mace::NetDef &net_def,
 void OpsTestNet::Sync() {
 #ifdef MACE_ENABLE_OPENCL
   if (net_ && device_type_ == DeviceType::GPU) {
-      OpTestContext::Get()->GetDevice(DeviceType::GPU)->opencl_runtime()
-          ->command_queue().finish();
+      OpTestContext::Get()->GetDevice(DeviceType::GPU)->gpu_runtime()
+      ->opencl_runtime()->command_queue().finish();
     }
 #endif
 }
diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc
index cb7979063097a07be88337b5b14db63a7ffe99f4..aa18b7c1c519f5ce2b27967647ddc900199a01f2 100644
--- a/mace/ops/pad.cc
+++ b/mace/ops/pad.cc
@@ -97,7 +97,7 @@ class PadOp<DeviceType::GPU, T> : public Operation {
     std::vector<int> paddings = Operation::GetRepeatedArgs<int>("paddings");
     float constant_value = Operation::GetOptionalArg<float>(
         "constant_value", 0.0);
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::PadKernel<T>(paddings, constant_value));
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc
index b2aef666266dfcd77b06047eab7891fd6cb82cef..50372c3cf1f1603d80eec28bce0d701535b9467d 100644
--- a/mace/ops/pooling.cc
+++ b/mace/ops/pooling.cc
@@ -429,7 +429,7 @@ class PoolingOp<DeviceType::GPU, T> : public PoolingOpBase {
  public:
   explicit PoolingOp(OpConstructContext *context)
       : PoolingOpBase(context) {
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::PoolingKernel<T>);
     } else {
       context->set_output_mem_type(MemoryType::GPU_BUFFER);
diff --git a/mace/ops/reduce_mean.cc b/mace/ops/reduce_mean.cc
index 20f7e81c8b54165388de9f5fd2f359c4d42d1862..863103b28fc607aa4003840ee72aefa88b917312 100644
--- a/mace/ops/reduce_mean.cc
+++ b/mace/ops/reduce_mean.cc
@@ -246,7 +246,7 @@ class ReduceMeanOp<DeviceType::GPU, T> : public ReduceMeanOpBase {
  public:
   explicit ReduceMeanOp(OpConstructContext *context)
       : ReduceMeanOpBase(context) {
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::ReduceMeanKernel<T>(axis_, keep_dims_));
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/resize_bicubic.cc b/mace/ops/resize_bicubic.cc
index 403300607cfcb929169a18946eff79085d6c534c..3ccff3e6010fd931afcaf1775e5bc21f88836520 100644
--- a/mace/ops/resize_bicubic.cc
+++ b/mace/ops/resize_bicubic.cc
@@ -195,7 +195,7 @@ class ResizeBicubicOp<DeviceType::GPU, T> : public Operation {
     std::vector<index_t> size = Operation::GetRepeatedArgs<index_t>(
         "size", {-1, -1});
     MACE_CHECK(size.size() == 2);
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::ResizeBicubicKernel<T>(align_corners,
                                                               size[0],
                                                               size[1]));
diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc
index 5ce6ef4a44a4bdb2f9d3b11057e9b317867d62d5..748c2efd13b4271d7f19964a987dcc0e28b9cc6f 100644
--- a/mace/ops/resize_bilinear.cc
+++ b/mace/ops/resize_bilinear.cc
@@ -331,7 +331,7 @@ class ResizeBilinearOp<DeviceType::GPU, T> : public Operation {
     std::vector<index_t> size = Operation::GetRepeatedArgs<index_t>(
         "size", {-1, -1});
     MACE_CHECK(size.size() == 2);
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::ResizeBilinearKernel<T>(align_corners,
                                                                size[0],
                                                                size[1]));
diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc
index 4a7505ae79bcbc211ae9fa17f65a4f941b8988a2..2518b407301952c4fa1edf16f0a5a128b427a538 100644
--- a/mace/ops/softmax.cc
+++ b/mace/ops/softmax.cc
@@ -364,7 +364,7 @@ class SoftmaxOp<DeviceType::GPU, T> : public Operation {
  public:
   explicit SoftmaxOp(OpConstructContext *context)
       : Operation(context) {
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::SoftmaxKernel<T>);
     } else {
       context->set_output_mem_type(MemoryType::GPU_BUFFER);
diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc
index 7d422938c77516f3e11ef3cf5e9f8b7bc7c5db15..7fa9081dfc3bb59b8ab975f54588b15299c32e49 100644
--- a/mace/ops/space_to_batch.cc
+++ b/mace/ops/space_to_batch.cc
@@ -308,7 +308,7 @@ class SpaceToBatchNDOp<DeviceType::GPU, T> : public SpaceToBatchOpBase {
  public:
   explicit SpaceToBatchNDOp(OpConstructContext *context)
       : SpaceToBatchOpBase(context) {
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::SpaceToBatchKernel<T>);
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc
index 11e5ade322e73fdeeee939f40f6b19243d3afe50..39e603ae615a0c66e0f11174d9c09200b234f003 100644
--- a/mace/ops/space_to_depth.cc
+++ b/mace/ops/space_to_depth.cc
@@ -94,7 +94,7 @@ class SpaceToDepthOp<DeviceType::GPU, T> : public Operation {
   explicit SpaceToDepthOp(OpConstructContext *context)
       : Operation(context) {
     int block_size = Operation::GetOptionalArg<int>("block_size", 1);
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::SpaceToDepthKernel<T>(block_size));
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/split.cc b/mace/ops/split.cc
index 2e09663178c45495b670b75a72ac7a013f478dc0..0f9dcc04bdb9c5b229f08c2b59f3e9551020f7a6 100644
--- a/mace/ops/split.cc
+++ b/mace/ops/split.cc
@@ -105,7 +105,7 @@ class SplitOp<DeviceType::GPU, T> : public Operation {
   explicit SplitOp(OpConstructContext *context)
       : Operation(context) {
     int32_t axis = Operation::GetOptionalArg<int>("axis", 3);
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::SplitKernel<T>(axis));
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc
index 7927da3b9a321d417386e2c76c8494e45a3417f2..b469a3e3b83b8d85241e0508b69b1a16fe62fe44 100644
--- a/mace/ops/sqrdiff_mean.cc
+++ b/mace/ops/sqrdiff_mean.cc
@@ -82,7 +82,7 @@ class SqrDiffMeanOp<DeviceType::GPU, T> : public Operation {
  public:
   explicit SqrDiffMeanOp(OpConstructContext *context)
       : Operation(context) {
-    if (context->device()->opencl_runtime()->UseImageMemory()) {
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
       kernel_.reset(new opencl::image::SqrDiffMeanKernel<T>());
     } else {
       MACE_NOT_IMPLEMENTED;