diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc
index 9a689f45f9c2ad63bc1711a28b81b1e39b56cef8..26fb2d0b0e4e17355efc9122c00a25147ffe00ba 100644
--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -22,7 +22,6 @@
 #include "gflags/gflags.h"
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/logging.h"
 #include "mace/utils/utils.h"
 #include "mace/benchmark/statistics.h"
@@ -257,36 +256,40 @@ int Main(int argc, char **argv) {
 
   mace::DeviceType device_type = ParseDeviceType(FLAGS_device);
 
-  // config runtime
-  MaceStatus ret = mace::SetOpenMPThreadPolicy(
+  // configuration
+  MaceStatus mace_status;
+  MaceEngineConfig config(device_type);
+  mace_status = config.SetCPUThreadPolicy(
       FLAGS_omp_num_threads,
-      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
+      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
       true);
-  if (ret != MACE_SUCCESS) {
-    LOG(WARNING) << "Set openmp or cpu affinity failed.";
+  if (mace_status != MACE_SUCCESS) {
+    LOG(INFO) << "Set openmp or cpu affinity failed.";
   }
 #ifdef MACE_ENABLE_OPENCL
+  std::shared_ptr<GPUContext> gpu_context;
   if (device_type == DeviceType::GPU) {
-    mace::SetGPUHints(
-        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
-        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
-
+    // DO NOT USE tmp directory.
+    // Please use APP's own directory and make sure the directory exists.
+    const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
+    const std::string storage_path =
+        std::string(storage_path_ptr == nullptr ?
+                    "/data/local/tmp/mace_run/interior" : storage_path_ptr);
     std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
-    mace::SetOpenCLBinaryPaths(opencl_binary_paths);
-    mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file);
+    gpu_context = GPUContextBuilder()
+        .SetStoragePath(storage_path)
+        .SetOpenCLBinaryPaths(opencl_binary_paths)
+        .SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
+        .Finalize();
+
+    config.SetGPUContext(gpu_context);
+    config.SetGPUHints(
+        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
+        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
   }
 #endif  // MACE_ENABLE_OPENCL
-  const char *kernel_path = getenv("MACE_INTERNAL_STORAGE_PATH");
-  const std::string kernel_file_path =
-      std::string(kernel_path == nullptr ?
-                  "/data/local/tmp/mace_run/interior" : kernel_path);
-
-  std::shared_ptr<KVStorageFactory> storage_factory(
-      new FileStorageFactory(kernel_file_path));
-  SetKVStorageFactory(storage_factory);
-
   // Create Engine
   std::shared_ptr<mace::MaceEngine> engine;
   MaceStatus create_engine_status;
@@ -306,7 +309,7 @@ int Main(int argc, char **argv) {
                                 model_data_file_ptr,
                                 input_names,
                                 output_names,
-                                device_type,
+                                config,
                                 &engine);
 #else
   create_engine_status =
@@ -314,7 +317,7 @@ int Main(int argc, char **argv) {
                           model_data_file_ptr,
                           input_names,
                           output_names,
-                          device_type,
+                          config,
                           &engine);
 #endif
   if (create_engine_status != MaceStatus::MACE_SUCCESS) {
diff --git a/mace/core/allocator.cc b/mace/core/allocator.cc
index 07776bc12fbcf6fd9db34577d8a0ea63a766f865..d9b5c3c226049a43a43d9a22feee04ad4a9f5add 100644
--- a/mace/core/allocator.cc
+++ b/mace/core/allocator.cc
@@ -13,30 +13,12 @@
 // limitations under the License.
#include "mace/core/allocator.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/opencl_allocator.h" -#endif namespace mace { -std::map *gAllocatorRegistry() { - static std::map g_allocator_registry; - return &g_allocator_registry; +Allocator *GetCPUAllocator() { + static CPUAllocator allocator; + return &allocator; } -Allocator *GetDeviceAllocator(DeviceType type) { - auto iter = gAllocatorRegistry()->find(type); - if (iter == gAllocatorRegistry()->end()) { - LOG(ERROR) << "Allocator not found for device " << type; - return nullptr; - } - return iter->second; -} - -MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator()); -#ifdef MACE_ENABLE_OPENCL -MACE_REGISTER_ALLOCATOR(DeviceType::GPU, new OpenCLAllocator()); -#endif -MACE_REGISTER_ALLOCATOR(DeviceType::HEXAGON, new CPUAllocator()); - } // namespace mace diff --git a/mace/core/allocator.h b/mace/core/allocator.h index a212e7f91434e13c6d4dd101bab16ce855153842..51f04741ca9b2d8d729673c14162024a1d9390d5 100644 --- a/mace/core/allocator.h +++ b/mace/core/allocator.h @@ -26,8 +26,6 @@ #include "mace/core/registry.h" #include "mace/core/types.h" #include "mace/core/runtime_failure_mock.h" -#include "mace/public/mace.h" -#include "mace/public/mace_runtime.h" namespace mace { @@ -138,26 +136,8 @@ class CPUAllocator : public Allocator { bool OnHost() const override { return true; } }; -std::map *gAllocatorRegistry(); - -Allocator *GetDeviceAllocator(DeviceType type); - -struct AllocatorRegisterer { - explicit AllocatorRegisterer(DeviceType type, Allocator *alloc) { - if (gAllocatorRegistry()->count(type)) { - LOG(ERROR) << "Allocator for device type " << type - << " registered twice. This should not happen." - << gAllocatorRegistry()->count(type); - std::exit(1); - } - gAllocatorRegistry()->emplace(type, alloc); - } -}; - -#define MACE_REGISTER_ALLOCATOR(type, alloc) \ - namespace { \ - static AllocatorRegisterer MACE_ANONYMOUS_VARIABLE(Allocator)(type, alloc); \ - } +// Global CPU allocator used for CPU/GPU/DSP +Allocator *GetCPUAllocator(); } // namespace mace diff --git a/mace/core/arg_helper.h b/mace/core/arg_helper.h index 3e1cca9323001359207f3971803fbbc017bf95b5..50ec4eade9c05eb12d0b555595a665e590a14965 100644 --- a/mace/core/arg_helper.h +++ b/mace/core/arg_helper.h @@ -20,7 +20,6 @@ #include #include "mace/proto/mace.pb.h" -#include "mace/public/mace.h" namespace mace { diff --git a/mace/core/buffer.h b/mace/core/buffer.h index b349cf4b4de46a39c51822d880e5132944ff1a22..c57a1714aa91e469e5e2d6ec6de392f8ca868821 100644 --- a/mace/core/buffer.h +++ b/mace/core/buffer.h @@ -218,9 +218,9 @@ class Buffer : public BufferBase { class Image : public BufferBase { public: - Image() + explicit Image(Allocator *allocator) : BufferBase(0), - allocator_(GetDeviceAllocator(GPU)), + allocator_(allocator), buf_(nullptr), mapped_buf_(nullptr) {} diff --git a/mace/core/device.cc b/mace/core/device.cc new file mode 100644 index 0000000000000000000000000000000000000000..09f5a068b934b535347a8992b01f162466f0b4c6 --- /dev/null +++ b/mace/core/device.cc @@ -0,0 +1,42 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/device.h"
+
+namespace mace {
+
+CPUDevice::CPUDevice(const int num_threads)
+    : cpu_runtime_(new CPURuntime(num_threads)) {}
+
+CPUDevice::~CPUDevice() = default;
+
+CPURuntime *CPUDevice::cpu_runtime() {
+  return cpu_runtime_.get();
+}
+
+#ifdef MACE_ENABLE_OPENCL
+OpenCLRuntime *CPUDevice::opencl_runtime() {
+  return nullptr;
+}
+#endif
+
+Allocator *CPUDevice::allocator() {
+  return GetCPUAllocator();
+}
+
+DeviceType CPUDevice::device_type() const {
+  return DeviceType::CPU;
+}
+
+}  // namespace mace
diff --git a/mace/core/device.h b/mace/core/device.h
new file mode 100644
index 0000000000000000000000000000000000000000..7336d79f8cb597005bb6c9021f320396ec9e48f0
--- /dev/null
+++ b/mace/core/device.h
@@ -0,0 +1,60 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_DEVICE_H_
+#define MACE_CORE_DEVICE_H_
+
+#include <memory>
+
+#include "mace/core/runtime/cpu/cpu_runtime.h"
+#include "mace/core/allocator.h"
+
+#ifdef MACE_ENABLE_OPENCL
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+#endif
+
+namespace mace {
+
+class Device {
+ public:
+  virtual ~Device() {}
+
+#ifdef MACE_ENABLE_OPENCL
+  virtual OpenCLRuntime *opencl_runtime() = 0;
+#endif
+  virtual CPURuntime *cpu_runtime() = 0;
+
+  virtual Allocator *allocator() = 0;
+  virtual DeviceType device_type() const = 0;
+};
+
+class CPUDevice : public Device {
+ public:
+  explicit CPUDevice(const int num_threads);
+  virtual ~CPUDevice();
+
+#ifdef MACE_ENABLE_OPENCL
+  OpenCLRuntime *opencl_runtime() override;
+#endif
+  CPURuntime *cpu_runtime() override;
+
+  Allocator *allocator() override;
+  DeviceType device_type() const override;
+
+ private:
+  std::unique_ptr<CPURuntime> cpu_runtime_;
+};
+
+}  // namespace mace
+#endif  // MACE_CORE_DEVICE_H_
diff --git a/mace/core/device_context.cc b/mace/core/device_context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..88a965fa2635da79dda3f3158b084c8ec8f41b11
--- /dev/null
+++ b/mace/core/device_context.cc
@@ -0,0 +1,73 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/device_context.h"
+
+#include <sys/stat.h>
+
+namespace mace {
+
+namespace {
+
+const char *kPrecompiledProgramFileName = "mace_cl_compiled_program.bin";
+
+std::string FindFirstExistPath(const std::vector<std::string> &paths) {
+  std::string result;
+  struct stat st;
+  for (auto path : paths) {
+    if (stat(path.c_str(), &st) == 0) {
+      if (S_ISREG(st.st_mode)) {
+        result = path;
+        break;
+      }
+    }
+  }
+  return result;
+}
+}  // namespace
+
+GPUContext::GPUContext(const std::string &storage_path,
+                       const std::vector<std::string> &opencl_binary_paths,
+                       const std::string &opencl_parameter_path)
+    : storage_factory_(new FileStorageFactory(storage_path)),
+      opencl_tuner_(new Tuner<uint32_t>(opencl_parameter_path)) {
+
+  if (!storage_path.empty()) {
+    opencl_cache_storage_ =
+        storage_factory_->CreateStorage(kPrecompiledProgramFileName);
+  }
+
+  std::string precompiled_binary_path =
+      FindFirstExistPath(opencl_binary_paths);
+  if (!precompiled_binary_path.empty()) {
+    opencl_binary_storage_.reset(
+        new FileStorage(precompiled_binary_path));
+  }
+}
+
+GPUContext::~GPUContext() = default;
+
+KVStorage *GPUContext::opencl_binary_storage() {
+  return opencl_binary_storage_.get();
+}
+
+KVStorage *GPUContext::opencl_cache_storage() {
+  return opencl_cache_storage_.get();
+}
+
+Tuner<uint32_t> *GPUContext::opencl_tuner() {
+  return opencl_tuner_.get();
+}
+
+}  // namespace mace
diff --git a/mace/core/device_context.h b/mace/core/device_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..21d076730f25b3070c0f680de5b6370d860612f9
--- /dev/null
+++ b/mace/core/device_context.h
@@ -0,0 +1,47 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_DEVICE_CONTEXT_H_
+#define MACE_CORE_DEVICE_CONTEXT_H_
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "mace/core/file_storage.h"
+#include "mace/utils/tuner.h"
+
+namespace mace {
+
+class GPUContext {
+ public:
+  GPUContext(const std::string &storage_path = "",
+             const std::vector<std::string> &opencl_binary_path = {},
+             const std::string &opencl_parameter_path = "");
+  ~GPUContext();
+
+  KVStorage *opencl_binary_storage();
+  KVStorage *opencl_cache_storage();
+  Tuner<uint32_t> *opencl_tuner();
+
+ private:
+  std::unique_ptr<KVStorageFactory> storage_factory_;
+  std::unique_ptr<Tuner<uint32_t>> opencl_tuner_;
+  std::unique_ptr<KVStorage> opencl_binary_storage_;
+  std::unique_ptr<KVStorage> opencl_cache_storage_;
+};
+
+}  // namespace mace
+#endif  // MACE_CORE_DEVICE_CONTEXT_H_
diff --git a/mace/core/file_storage.cc b/mace/core/file_storage.cc
index 99731a813f26b9c5593b492b0f2f16ec1e653f40..7c1fb35b3ebac4df42d2aac451ccbbbf0b4de464 100644
--- a/mace/core/file_storage.cc
+++ b/mace/core/file_storage.cc
@@ -28,10 +28,36 @@
 
 namespace mace {
 
-std::shared_ptr<KVStorageFactory> kStorageFactory = nullptr;
+class FileStorageFactory::Impl {
+ public:
+  explicit Impl(const std::string &path);
+
+  std::unique_ptr<KVStorage> CreateStorage(const std::string &name);
+
+ private:
+  std::string path_;
+};
+
+FileStorageFactory::Impl::Impl(const std::string &path): path_(path) {}
+
+std::unique_ptr<KVStorage> FileStorageFactory::Impl::CreateStorage(
+    const std::string &name) {
+  return std::move(std::unique_ptr<KVStorage>(
+      new FileStorage(path_ + "/" + name)));
+}
+
+FileStorageFactory::FileStorageFactory(const std::string &path):
+    impl_(new FileStorageFactory::Impl(path)) {}
+
+FileStorageFactory::~FileStorageFactory() = default;
+
+std::unique_ptr<KVStorage> FileStorageFactory::CreateStorage(
+    const std::string &name) {
+  return impl_->CreateStorage(name);
+}
 
 FileStorage::FileStorage(const std::string &file_path):
-    data_changed_(false), file_path_(file_path) {}
+    loaded_(false), data_changed_(false), file_path_(file_path) {}
 
 int FileStorage::Load() {
   struct stat st;
@@ -47,6 +73,9 @@ int FileStorage::Load() {
     }
   }
   utils::WriteLock lock(&data_mutex_);
+  if (loaded_) {
+    return 0;
+  }
   int fd = open(file_path_.c_str(), O_RDONLY);
   if (fd < 0) {
     if (errno == ENOENT) {
@@ -118,13 +147,17 @@ int FileStorage::Load() {
                << " failed, error code: " << strerror(errno);
     return -1;
   }
+  loaded_ = true;
   return 0;
 }
 
-void FileStorage::Clear() {
+bool FileStorage::Clear() {
   utils::WriteLock lock(&data_mutex_);
-  data_.clear();
-  data_changed_ = true;
+  if (!data_.empty()) {
+    data_.clear();
+    data_changed_ = true;
+  }
+  return true;
 }
 
 bool FileStorage::Insert(const std::string &key,
diff --git a/mace/core/file_storage.h b/mace/core/file_storage.h
index 3b648c23379c0502d5272ee720d52d7ae792b9b2..c4efe8c3565229b371a99f59fc71345c041577bf 100644
--- a/mace/core/file_storage.h
+++ b/mace/core/file_storage.h
@@ -16,27 +16,64 @@
 #define MACE_CORE_FILE_STORAGE_H_
 
 #include <map>
+#include <memory>
 #include <string>
 #include <vector>
 
-#include "mace/public/mace_runtime.h"
+#include "mace/public/mace.h"
 #include "mace/utils/rwlock.h"
 
 namespace mace {
 
+class KVStorage {
+ public:
+  // return: 0 for success, -1 for error
+  virtual int Load() = 0;
+  virtual bool Clear() = 0;
+  // insert or update the key-value.
+  virtual bool Insert(const std::string &key,
+                      const std::vector<unsigned char> &value) = 0;
+  virtual const std::vector<unsigned char> *Find(const std::string &key) = 0;
+  // return: 0 for success, -1 for error
+  virtual int Flush() = 0;
+  virtual ~KVStorage() {}
+};
+
+class KVStorageFactory {
+ public:
+  virtual std::unique_ptr<KVStorage> CreateStorage(const std::string &name) = 0;
+
+  virtual ~KVStorageFactory() {}
+};
+
+class FileStorageFactory : public KVStorageFactory {
+ public:
+  // You have to make sure your APP have read and write permission of the path.
+  explicit FileStorageFactory(const std::string &path);
+
+  ~FileStorageFactory();
+
+  std::unique_ptr<KVStorage> CreateStorage(const std::string &name) override;
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
 class FileStorage : public KVStorage {
  public:
   explicit FileStorage(const std::string &file_path);
 
  public:
   int Load() override;
-  void Clear() override;
+  bool Clear() override;
   bool Insert(const std::string &key,
               const std::vector<unsigned char> &value) override;
   const std::vector<unsigned char> *Find(const std::string &key) override;
   int Flush() override;
 
  private:
+  bool loaded_;
   bool data_changed_;
   std::string file_path_;
   std::map<std::string, std::vector<unsigned char>> data_;
diff --git a/mace/core/net.cc b/mace/core/net.cc
index ec8afdd14f6a7abf8fbe4bd8d4b0bab4dd5e4e94..0c538b801bb1f9c8bcbbc109dc80fc0893255dfe 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -18,6 +18,7 @@
 
 #include "mace/core/macros.h"
 #include "mace/core/net.h"
+#include "mace/public/mace.h"
 #include "mace/utils/memory_logging.h"
 #include "mace/utils/timer.h"
 #include "mace/utils/utils.h"
@@ -27,30 +28,35 @@ namespace mace {
 NetBase::NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry,
                  const std::shared_ptr<const NetDef> net_def,
                  Workspace *ws,
-                 DeviceType type)
+                 Device *device)
     : name_(net_def->name()), op_registry_(op_registry) {
   MACE_UNUSED(ws);
-  MACE_UNUSED(type);
+  MACE_UNUSED(device);
 }
 
 SerialNet::SerialNet(
     const std::shared_ptr<const OperatorRegistryBase> op_registry,
    const std::shared_ptr<const NetDef> net_def,
    Workspace *ws,
-    DeviceType type,
+    Device *device,
    const NetMode mode)
-    : NetBase(op_registry, net_def, ws, type), device_type_(type) {
+    : NetBase(op_registry, net_def, ws, device), device_(device),
+      op_kernel_context_(new OpKernelContext(ws, device)) {
   MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
+  DeviceType device_type = device->device_type();
   for (int idx = 0; idx < net_def->op_size(); ++idx) {
     const auto &operator_def = net_def->op(idx);
     // TODO(liuqi): refactor to add device_type to OperatorDef
     const int op_device = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
-        operator_def, "device", static_cast<int>(device_type_));
-    if (op_device == type) {
+        operator_def, "device", static_cast<int>(device_type));
+    if (op_device == device_type) {
+      VLOG(3) << "Creating operator " << operator_def.name() << "("
+              << operator_def.type() << ")";
       OperatorDef temp_def(operator_def);
       std::unique_ptr<OperatorBase> op(
-          op_registry->CreateOperator(temp_def, ws, type, mode));
+          op_registry->CreateOperator(temp_def, op_kernel_context_.get(),
+                                      device_type, mode));
       if (op) {
         operators_.emplace_back(std::move(op));
       }
@@ -61,13 +67,14 @@ SerialNet::SerialNet(
 MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
   MACE_MEMORY_LOGGING_GUARD();
   MACE_LATENCY_LOGGER(1, "Running net");
+  const DeviceType device_type = device_->device_type();
   for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
     auto &op = *iter;
     MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
                         op->debug_def().type(), "), mem_id: ",
                         MakeListString(op->debug_def().mem_id().data(),
                                        op->debug_def().mem_id().size()));
-    bool future_wait = (device_type_ == DeviceType::GPU &&
+    bool future_wait = (device_type == DeviceType::GPU &&
                         (run_metadata != nullptr ||
                          std::distance(iter, operators_.end()) == 1));
@@ -80,6 +87,9 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
       } else {
         future.wait_fn(nullptr);
       }
+#ifdef MACE_ENABLE_OPENCL
+      device_->opencl_runtime()->command_queue().finish();
+#endif
     } else if (run_metadata != nullptr) {
       call_stats.start_micros = NowMicros();
       MACE_RETURN_IF_ERROR(op->Run(nullptr));
@@ -125,7 +135,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
       VLOG(3) << "Operator " << op->debug_def().name()
               << " has shape: " << MakeString(op->Output(0)->shape());
 
-      if (EnvEnabled("MACE_LOG_TENSOR_RANGE") && device_type_ == CPU) {
+      if (EnvEnabled("MACE_LOG_TENSOR_RANGE") && device_type == CPU) {
         for (int i = 0; i < op->OutputSize(); ++i) {
           int data_type = op->GetOptionalArg<int>("T", static_cast<int>(DT_FLOAT));
           if (data_type == static_cast<int>(DT_FLOAT)) {
@@ -151,20 +161,20 @@ std::unique_ptr<NetBase> CreateNet(
     const std::shared_ptr<const OperatorRegistryBase> op_registry,
     const NetDef &net_def,
     Workspace *ws,
-    DeviceType type,
+    Device *device,
     const NetMode mode) {
   std::shared_ptr<NetDef> tmp_net_def(new NetDef(net_def));
-  return CreateNet(op_registry, tmp_net_def, ws, type, mode);
+  return CreateNet(op_registry, tmp_net_def, ws, device, mode);
 }
 
 std::unique_ptr<NetBase> CreateNet(
     const std::shared_ptr<const OperatorRegistryBase> op_registry,
     const std::shared_ptr<const NetDef> net_def,
     Workspace *ws,
-    DeviceType type,
+    Device *device,
     const NetMode mode) {
   std::unique_ptr<NetBase> net(
-      new SerialNet(op_registry, net_def, ws, type, mode));
+      new SerialNet(op_registry, net_def, ws, device, mode));
   return net;
 }
diff --git a/mace/core/net.h b/mace/core/net.h
index 0cec40594c5a12924ff3ee82595b12af4b6f689c..a63ded668e582e46d0d8a60b492478797a514cd5 100644
--- a/mace/core/net.h
+++ b/mace/core/net.h
@@ -20,7 +20,6 @@
 #include <vector>
 
 #include "mace/core/operator.h"
-#include "mace/public/mace.h"
 
 namespace mace {
 
@@ -33,7 +32,7 @@ class NetBase {
   NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry,
           const std::shared_ptr<const NetDef> net_def,
           Workspace *ws,
-          DeviceType type);
+          Device *device);
   virtual ~NetBase() noexcept {}
 
   virtual MaceStatus Run(RunMetadata *run_metadata = nullptr) = 0;
@@ -52,14 +51,15 @@ class SerialNet : public NetBase {
   SerialNet(const std::shared_ptr<const OperatorRegistryBase> op_registry,
             const std::shared_ptr<const NetDef> net_def,
             Workspace *ws,
-            DeviceType type,
+            Device *device,
             const NetMode mode = NetMode::NORMAL);
 
   MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
 
  protected:
   std::vector<std::unique_ptr<OperatorBase> > operators_;
-  DeviceType device_type_;
+  Device *device_;
+  std::unique_ptr<OpKernelContext> op_kernel_context_;
 
   MACE_DISABLE_COPY_AND_ASSIGN(SerialNet);
 };
@@ -68,13 +68,13 @@ std::unique_ptr<NetBase> CreateNet(
     const std::shared_ptr<const OperatorRegistryBase> op_registry,
     const NetDef &net_def,
     Workspace *ws,
-    DeviceType type,
+    Device *device,
     const NetMode mode = NetMode::NORMAL);
 
 std::unique_ptr<NetBase> CreateNet(
     const std::shared_ptr<const OperatorRegistryBase> op_registry,
     const std::shared_ptr<const NetDef> net_def,
     Workspace *ws,
-    DeviceType type,
+    Device *device,
     const NetMode mode = NetMode::NORMAL);
 
 }  // namespace mace
diff --git a/mace/core/op_kernel_context.cc b/mace/core/op_kernel_context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..20f9e561a43ea58179818fcf03989020bf6692a5
--- /dev/null
+++ b/mace/core/op_kernel_context.cc
@@ -0,0 +1,32 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/op_kernel_context.h"
+
+namespace mace {
+
+OpKernelContext::OpKernelContext(Workspace *ws, Device *device)
+    : device_(device), ws_(ws) {}
+
+OpKernelContext::~OpKernelContext() = default;
+
+Device* OpKernelContext::device() {
+  return device_;
+}
+
+Workspace* OpKernelContext::workspace() {
+  return ws_;
+}
+
+}  // namespace mace
diff --git a/mace/core/op_kernel_context.h b/mace/core/op_kernel_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe5e777cd5b5647ccb42f684fb1363224740333e
--- /dev/null
+++ b/mace/core/op_kernel_context.h
@@ -0,0 +1,34 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_OP_KERNEL_CONTEXT_H_
+#define MACE_CORE_OP_KERNEL_CONTEXT_H_
+
+#include "mace/core/device.h"
+#include "mace/core/workspace.h"
+namespace mace {
+
+class OpKernelContext {
+ public:
+  OpKernelContext(Workspace *ws, Device *device);
+  ~OpKernelContext();
+  Device *device();
+  Workspace *workspace();
+ private:
+  Device *device_;
+  Workspace *ws_;
+};
+
+}  // namespace mace
+#endif  // MACE_CORE_OP_KERNEL_CONTEXT_H_
diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index 20769fa30bfb5e87eb21fdcbb0c3b98b98365570..5e4048358bfc7717da3a09e93899800750bb157a 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -18,12 +18,15 @@
 #include <memory>
 
 #include "mace/core/operator.h"
+#include "mace/core/op_kernel_context.h"
 
 namespace mace {
 
-OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws)
-    : operator_ws_(ws),
-      operator_def_(std::make_shared<OperatorDef>(operator_def)) {}
+OperatorBase::OperatorBase(const OperatorDef &operator_def,
+                           OpKernelContext *context)
+    : operator_def_(std::make_shared<OperatorDef>(operator_def)) {
+  MACE_UNUSED(context);
+}
 
 OpKeyBuilder::OpKeyBuilder(const char *op_name) : op_name_(op_name) {}
 
@@ -54,7 +57,7 @@ OperatorRegistryBase::~OperatorRegistryBase() {}
 
 std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator(
     const OperatorDef &operator_def,
-    Workspace *ws,
+    OpKernelContext *context,
     DeviceType type,
     const NetMode mode) const {
   const int dtype = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
@@ -70,7 +73,7 @@ std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator(
             .Device(type)
             .TypeConstraint("T", static_cast<DataType>(dtype))
             .Build(),
-        operator_def, ws);
+        operator_def, context);
   } else {
     return nullptr;
   }
diff --git a/mace/core/operator.h b/mace/core/operator.h
index 330f8002288badec78de4d6987caff0d0762cb05..6be38890ebad2c18448ed01a69c57ff016ea2775 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -22,17 +22,17 @@
"mace/core/arg_helper.h" #include "mace/core/future.h" +#include "mace/core/op_kernel_context.h" #include "mace/core/registry.h" #include "mace/core/tensor.h" #include "mace/core/workspace.h" #include "mace/proto/mace.pb.h" -#include "mace/public/mace.h" namespace mace { class OperatorBase { public: - explicit OperatorBase(const OperatorDef &operator_def, Workspace *ws); + explicit OperatorBase(const OperatorDef &operator_def, OpKernelContext *); virtual ~OperatorBase() noexcept {} template @@ -78,7 +78,6 @@ class OperatorBase { inline bool has_debug_def() const { return operator_def_ != nullptr; } protected: - Workspace *operator_ws_; std::shared_ptr operator_def_; std::vector inputs_; std::vector outputs_; @@ -89,8 +88,9 @@ class OperatorBase { template class Operator : public OperatorBase { public: - explicit Operator(const OperatorDef &operator_def, Workspace *ws) - : OperatorBase(operator_def, ws) { + explicit Operator(const OperatorDef &operator_def, OpKernelContext *context) + : OperatorBase(operator_def, context) { + Workspace *ws = context->workspace(); for (const std::string &input_str : operator_def.input()) { const Tensor *tensor = ws->GetTensor(input_str); MACE_CHECK(tensor != nullptr, "op ", operator_def.type(), @@ -116,7 +116,7 @@ class Operator : public OperatorBase { output_type = DataTypeToEnum::v(); } outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor( - output_str, GetDeviceAllocator(D), output_type))); + output_str, context->device()->allocator(), output_type))); } } } @@ -165,13 +165,16 @@ OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) { class OperatorRegistryBase { public: - typedef Registry + typedef Registry RegistryType; OperatorRegistryBase() = default; virtual ~OperatorRegistryBase(); RegistryType *registry() { return ®istry_; } std::unique_ptr CreateOperator(const OperatorDef &operator_def, - Workspace *ws, + OpKernelContext *context, DeviceType type, const NetMode mode) const; @@ -183,7 +186,7 @@ class OperatorRegistryBase { MACE_DECLARE_REGISTRY(OpRegistry, OperatorBase, const OperatorDef &, - Workspace *); + OpKernelContext *); #define MACE_REGISTER_OPERATOR(op_registry, name, ...) 
   MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__)
diff --git a/mace/core/registry.h b/mace/core/registry.h
index ac81328731c3178ee19798cae862086f25ed8c29..1ad92f0aab36fe130497209fe7ebe034f1e025ad 100644
--- a/mace/core/registry.h
+++ b/mace/core/registry.h
@@ -22,7 +22,6 @@
 #include <string>
 #include <vector>
 
-#include "mace/public/mace.h"
 #include "mace/utils/logging.h"
 
 namespace mace {
diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc
index f9b1d49f2f9dad0408a3b1922c12169444aa549c..671d4cdfdbae34f6c2a4026c6bfceceebe78292c 100644
--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -30,7 +30,6 @@
 #include "public/gemmlowp.h"
 #include "mace/core/macros.h"
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/logging.h"
 
 namespace mace {
diff --git a/mace/core/runtime/cpu/cpu_runtime.h b/mace/core/runtime/cpu/cpu_runtime.h
index 3382a8f1c66de2b8fa41b3420b380efc91da5ab1..83d397ee88b39a9e31d72198bc56f950969168c9 100644
--- a/mace/core/runtime/cpu/cpu_runtime.h
+++ b/mace/core/runtime/cpu/cpu_runtime.h
@@ -18,7 +18,6 @@
 #include <vector>
 
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 
 namespace mace {
 
@@ -34,6 +33,16 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
                                              CPUAffinityPolicy policy,
                                              bool use_gemmlowp = false);
 
+class CPURuntime {
+ public:
+  explicit CPURuntime(const int num_threads) : num_threads_(num_threads) {}
+  ~CPURuntime() = default;
+  inline int num_threads() const {
+    return num_threads_;
+  }
+ private:
+  int num_threads_;
+};
 }  // namespace mace
 
 #endif  // MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
diff --git a/mace/core/runtime/opencl/gpu_device.cc b/mace/core/runtime/opencl/gpu_device.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cd9e41bb16db3151e77f1742f23c8866f427810f
--- /dev/null
+++ b/mace/core/runtime/opencl/gpu_device.cc
@@ -0,0 +1,44 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "mace/core/runtime/opencl/gpu_device.h" + +namespace mace { + +GPUDevice::GPUDevice(Tuner *tuner, + KVStorage *opencl_cache_storage, + const GPUPriorityHint priority, + const GPUPerfHint perf, + KVStorage *opencl_binary_storage, + const int num_threads) : + CPUDevice(num_threads), + runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf, + opencl_binary_storage, tuner)), + allocator_(new OpenCLAllocator(runtime_.get())) {} + +GPUDevice::~GPUDevice() = default; + +OpenCLRuntime* GPUDevice::opencl_runtime() { + return runtime_.get(); +} + +Allocator* GPUDevice::allocator() { + return allocator_.get(); +} + +DeviceType GPUDevice::device_type() const { + return DeviceType::GPU; +} + +} // namespace mace diff --git a/mace/core/runtime/opencl/gpu_device.h b/mace/core/runtime/opencl/gpu_device.h new file mode 100644 index 0000000000000000000000000000000000000000..1526ba0ae4ed7cb3b1170f89dc786da279c925bd --- /dev/null +++ b/mace/core/runtime/opencl/gpu_device.h @@ -0,0 +1,44 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_ +#define MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_ + +#include + +#include "mace/core/device_context.h" +#include "mace/core/device.h" +#include "mace/core/runtime/opencl/opencl_allocator.h" + +namespace mace { + +class GPUDevice : public CPUDevice { + public: + GPUDevice(Tuner *tuner, + KVStorage *opencl_cache_storage = nullptr, + const GPUPriorityHint priority = GPUPriorityHint::PRIORITY_LOW, + const GPUPerfHint perf = GPUPerfHint::PERF_NORMAL, + KVStorage *opencl_binary_storage = nullptr, + const int num_threads = -1); + ~GPUDevice(); + OpenCLRuntime *opencl_runtime() override; + Allocator *allocator() override; + DeviceType device_type() const override; + private: + std::unique_ptr runtime_; + std::unique_ptr allocator_; +}; + +} // namespace mace +#endif // MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_ diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc index 86b0138d727da41171c315fde3e121d88877fb04..c22e4f8f5663a27b9596915dad1f64864c8f3ec9 100644 --- a/mace/core/runtime/opencl/opencl_allocator.cc +++ b/mace/core/runtime/opencl/opencl_allocator.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include <memory>
+
 #include "mace/core/runtime/opencl/opencl_allocator.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 
 namespace mace {
@@ -37,7 +38,9 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
   }
 }
 }  // namespace
 
-OpenCLAllocator::OpenCLAllocator() {}
+OpenCLAllocator::OpenCLAllocator(
+    OpenCLRuntime *opencl_runtime):
+    opencl_runtime_(opencl_runtime) {}
 
 OpenCLAllocator::~OpenCLAllocator() {}
 
 MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
@@ -51,7 +54,7 @@ MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
   }
 
   cl_int error;
-  cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(),
+  cl::Buffer *buffer = new cl::Buffer(opencl_runtime_->context(),
                                       CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                       nbytes, nullptr, &error);
   if (error != CL_SUCCESS) {
@@ -82,7 +85,7 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
 
   cl_int error;
   cl::Image2D *cl_image =
-      new cl::Image2D(OpenCLRuntime::Global()->context(),
+      new cl::Image2D(opencl_runtime_->context(),
                       CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                       img_format, image_shape[0], image_shape[1], 0,
                       nullptr, &error);
   if (error != CL_SUCCESS) {
@@ -116,8 +119,9 @@ void OpenCLAllocator::DeleteImage(void *buffer) const {
 }
 
 void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
+  VLOG(3) << "Map OpenCL buffer";
   auto cl_buffer = static_cast<cl::Buffer *>(buffer);
-  auto queue = OpenCLRuntime::Global()->command_queue();
+  auto queue = opencl_runtime_->command_queue();
   // TODO(heliangliang) Non-blocking call
   cl_int error;
   void *mapped_ptr =
@@ -134,14 +138,15 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
 void *OpenCLAllocator::MapImage(void *buffer,
                                 const std::vector<size_t> &image_shape,
                                 std::vector<size_t> *mapped_image_pitch) const {
-  MACE_CHECK(image_shape.size() == 2, "Just support map 2d image");
+  VLOG(3) << "Map OpenCL Image";
+  MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
   auto cl_image = static_cast<cl::Image2D *>(buffer);
   std::array<size_t, 3> origin = {0, 0, 0};
   std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1};
 
   mapped_image_pitch->resize(2);
   cl_int error;
-  void *mapped_ptr = OpenCLRuntime::Global()->command_queue().enqueueMapImage(
+  void *mapped_ptr = opencl_runtime_->command_queue().enqueueMapImage(
       *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
       mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
      nullptr, &error);
@@ -153,8 +158,9 @@ void *OpenCLAllocator::MapImage(void *buffer,
 }
 
 void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
+  VLOG(3) << "Unmap OpenCL buffer/Image";
   auto cl_buffer = static_cast<cl::Buffer *>(buffer);
-  auto queue = OpenCLRuntime::Global()->command_queue();
+  auto queue = opencl_runtime_->command_queue();
   cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,
                                              nullptr, nullptr);
   if (error != CL_SUCCESS) {
diff --git a/mace/core/runtime/opencl/opencl_allocator.h b/mace/core/runtime/opencl/opencl_allocator.h
index 6304add8583f7b2e47c58cd6e6b186ea43b7f092..d2b7556beb09086ca8091dbd70eb4566c62414a6 100644
--- a/mace/core/runtime/opencl/opencl_allocator.h
+++ b/mace/core/runtime/opencl/opencl_allocator.h
@@ -15,15 +15,17 @@
 #ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_
 #define MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_
 
+#include <memory>
 #include <vector>
 
 #include "mace/core/allocator.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
 
 namespace mace {
 
 class OpenCLAllocator : public Allocator {
  public:
-  OpenCLAllocator();
+  explicit OpenCLAllocator(OpenCLRuntime *opencl_runtime);
 
   ~OpenCLAllocator() override;
 
@@ -51,6 +53,9 @@ class OpenCLAllocator : public Allocator {
   void Unmap(void *buffer, void *mapped_ptr) const override;
 
   bool OnHost() const override;
+
+ private:
+  OpenCLRuntime *opencl_runtime_;
 };
 
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 305337034d98622db928d9133c4cc69597900ffe..967a040f01395d1ea13b25d6c1a1c67650a95c3b 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -24,11 +24,9 @@
 #include <string>
 #include <vector>
 
-#include "mace/public/mace_runtime.h"
 #include "mace/core/macros.h"
 #include "mace/core/file_storage.h"
 #include "mace/core/runtime/opencl/opencl_extension.h"
-#include "mace/public/mace.h"
 #include "mace/utils/tuner.h"
 
 namespace mace {
@@ -249,14 +247,12 @@ std::string FindFirstExistPath(const std::vector<std::string> &paths) {
 
 const char *kOpenCLPlatformInfoKey =
     "mace_opencl_precompiled_platform_info_key";
-const char *kPrecompiledProgramFileName =
-    "mace_cl_compiled_program.bin";
 }  // namespace
 
 void OpenCLProfilingTimer::StartTiming() {}
 
 void OpenCLProfilingTimer::StopTiming() {
-  OpenCLRuntime::Global()->command_queue().finish();
+  runtime_->command_queue().finish();
   start_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
   stop_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
 }
@@ -278,35 +274,15 @@ void OpenCLProfilingTimer::ClearTiming() {
   accumulated_micros_ = 0;
 }
 
-GPUPerfHint OpenCLRuntime::kGPUPerfHint = GPUPerfHint::PERF_NORMAL;
-GPUPriorityHint OpenCLRuntime::kGPUPriorityHint =
-    GPUPriorityHint::PRIORITY_DEFAULT;
-std::string
-    OpenCLRuntime::kPrecompiledBinaryPath = "";  // NOLINT(runtime/string)
-
-OpenCLRuntime *OpenCLRuntime::Global() {
-  static OpenCLRuntime runtime;
-  return &runtime;
-}
-
-void OpenCLRuntime::Configure(GPUPerfHint gpu_perf_hint,
-                              GPUPriorityHint gpu_priority_hint) {
-  OpenCLRuntime::kGPUPerfHint = gpu_perf_hint;
-  OpenCLRuntime::kGPUPriorityHint = gpu_priority_hint;
-}
-
-void OpenCLRuntime::ConfigureOpenCLBinaryPath(
-    const std::vector<std::string> &paths) {
-  OpenCLRuntime::kPrecompiledBinaryPath = FindFirstExistPath(paths);
-  if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
-    LOG(WARNING) << "There is no precompiled OpenCL binary file in "
-                 << MakeString(paths);
-  }
-}
-
-OpenCLRuntime::OpenCLRuntime():
-    precompiled_binary_storage_(nullptr),
-    cache_storage_(nullptr),
+OpenCLRuntime::OpenCLRuntime(
+    KVStorage *cache_storage,
+    const GPUPriorityHint priority_hint,
+    const GPUPerfHint perf_hint,
+    KVStorage *precompiled_binary_storage,
+    Tuner<uint32_t> *tuner):
+    cache_storage_(cache_storage),
+    precompiled_binary_storage_(precompiled_binary_storage),
+    tuner_(tuner),
     is_opencl_avaliable_(false),
     is_profiling_enabled_(false),
     opencl_version_(CL_VER_UNKNOWN),
@@ -362,7 +338,7 @@ OpenCLRuntime::OpenCLRuntime():
   cl_command_queue_properties properties = 0;
 
   const char *profiling = getenv("MACE_OPENCL_PROFILING");
-  if (Tuner<uint32_t>::Get()->IsTuning() ||
+  if (IsTuning() ||
      (profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) {
    properties |= CL_QUEUE_PROFILING_ENABLE;
    is_profiling_enabled_ = true;
@@ -374,8 +350,8 @@ OpenCLRuntime::OpenCLRuntime():
     std::vector<cl_context_properties> context_properties;
     context_properties.reserve(5);
     GetAdrenoContextProperties(&context_properties,
-                               OpenCLRuntime::kGPUPerfHint,
-                               OpenCLRuntime::kGPUPriorityHint);
+                               perf_hint,
+                               priority_hint);
     context_ = std::shared_ptr<cl::Context>(
         new cl::Context({*device_}, context_properties.data(), nullptr,
                        nullptr, &err));
@@ -408,12 +384,8 @@ OpenCLRuntime::OpenCLRuntime():
     return;
   }
 
-  extern std::shared_ptr<KVStorageFactory> kStorageFactory;
   std::string cached_binary_platform_info;
-  if (kStorageFactory != nullptr) {
-    cache_storage_ =
-        kStorageFactory->CreateStorage(kPrecompiledProgramFileName);
-
+  if (cache_storage_ != nullptr) {
     if (cache_storage_->Load() != 0) {
       LOG(WARNING) << "Load OpenCL cached compiled kernel file failed. "
                    << "Please make sure the storage directory exist "
@@ -432,9 +404,10 @@ OpenCLRuntime::OpenCLRuntime():
   }
 
   if (cached_binary_platform_info != platform_info_) {
-    if (!OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
-      precompiled_binary_storage_.reset(
-          new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath));
+    if (precompiled_binary_storage_ == nullptr) {
+      VLOG(1) << "There is no precompiled OpenCL binary in"
+                 " all OpenCL binary paths.";
+    } else {
       if (precompiled_binary_storage_->Load() != 0) {
         LOG(WARNING) << "Load OpenCL precompiled kernel file failed. "
                      << "Please make sure the storage directory exist "
@@ -487,6 +460,8 @@ cl::Device &OpenCLRuntime::device() { return *device_; }
 
 cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; }
 
+Tuner<uint32_t> *OpenCLRuntime::tuner() { return tuner_; }
+
 uint64_t OpenCLRuntime::device_global_mem_cache_size() const {
   return device_gloabl_mem_cache_size_;
 }
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index 537707fa654fea9dce01c48297d802cdbc27bd2a..222fe8514a4cf4b08c944959e2faf8f646bf5c29 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -22,11 +22,12 @@
 #include <string>
 #include <vector>
 
+#include "mace/core/file_storage.h"
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/string_util.h"
 #include "mace/utils/timer.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 
@@ -60,29 +61,17 @@ const std::string OpenCLErrorToString(cl_int error);
     return MaceStatus::MACE_OUT_OF_RESOURCES; \
   }
 
-class OpenCLProfilingTimer : public Timer {
- public:
-  explicit OpenCLProfilingTimer(const cl::Event *event)
-      : event_(event), accumulated_micros_(0) {}
-  void StartTiming() override;
-  void StopTiming() override;
-  void AccumulateTiming() override;
-  void ClearTiming() override;
-  double ElapsedMicros() override;
-  double AccumulatedMicros() override;
-
- private:
-  const cl::Event *event_;
-  double start_nanos_;
-  double stop_nanos_;
-  double accumulated_micros_;
-};
-
 class OpenCLRuntime {
  public:
-  static OpenCLRuntime *Global();
-  static void Configure(GPUPerfHint, GPUPriorityHint);
-  static void ConfigureOpenCLBinaryPath(const std::vector<std::string> &paths);
+  OpenCLRuntime(
+      KVStorage *cache_storage = nullptr,
+      const GPUPriorityHint priority_hint = GPUPriorityHint::PRIORITY_NORMAL,
+      const GPUPerfHint perf_hint = GPUPerfHint::PERF_NORMAL,
+      KVStorage *precompiled_binary_storage = nullptr,
+      Tuner<uint32_t> *tuner = nullptr);
+  ~OpenCLRuntime();
+  OpenCLRuntime(const OpenCLRuntime &) = delete;
+  OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
 
   cl::Context &context();
   cl::Device &device();
@@ -91,6 +80,7 @@ class OpenCLRuntime {
   const std::string platform_info() const;
   uint64_t device_global_mem_cache_size() const;
   uint32_t device_compute_units() const;
+  Tuner<uint32_t> *tuner();
   bool is_opencl_avaliable();
 
   void GetCallStats(const cl::Event &event, CallStats *stats);
@@ -112,11 +102,6 @@ class OpenCLRuntime {
   void SaveBuiltCLProgram();
 
 private:
-  OpenCLRuntime();
-  ~OpenCLRuntime();
-  OpenCLRuntime(const OpenCLRuntime &) = delete;
-  OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
-
   bool BuildProgram(const std::string &program_file_name,
                     const std::string &binary_file_name,
                     const std::string &build_options,
@@ -137,10 +122,13 @@ class OpenCLRuntime {
   OpenCLVersion ParseDeviceVersion(const std::string &device_version);
 
 private:
-  std::unique_ptr<KVStorage> precompiled_binary_storage_;
-  std::unique_ptr<KVStorage> cache_storage_;
+  KVStorage *cache_storage_;
+  KVStorage *precompiled_binary_storage_;
+  Tuner<uint32_t> *tuner_;
   bool is_opencl_avaliable_;
   bool is_profiling_enabled_;
+  OpenCLVersion opencl_version_;
+  GPUType gpu_type_;
   // All OpenCL object must be a pointer and manually deleted before unloading
   // OpenCL library.
   std::shared_ptr<cl::Context> context_;
@@ -149,18 +137,30 @@ class OpenCLRuntime {
   std::map<std::string, cl::Program> built_program_map_;
   std::mutex program_build_mutex_;
   std::string platform_info_;
-  OpenCLVersion opencl_version_;
   std::string precompiled_binary_platform_info_;
   bool out_of_range_check_;
   uint64_t device_gloabl_mem_cache_size_;
   uint32_t device_compute_units_;
-  GPUType gpu_type_;
-
-  static GPUPerfHint kGPUPerfHint;
-  static GPUPriorityHint kGPUPriorityHint;
-  static std::string kPrecompiledBinaryPath;
 };
 
+class OpenCLProfilingTimer : public Timer {
+ public:
+  OpenCLProfilingTimer(OpenCLRuntime *runtime, const cl::Event *event)
+      : runtime_(runtime), event_(event), accumulated_micros_(0) {}
+  void StartTiming() override;
+  void StopTiming() override;
+  void AccumulateTiming() override;
+  void ClearTiming() override;
+  double ElapsedMicros() override;
+  double AccumulatedMicros() override;
+
+ private:
+  OpenCLRuntime *runtime_;
+  const cl::Event *event_;
+  double start_nanos_;
+  double stop_nanos_;
+  double accumulated_micros_;
+};
 }  // namespace mace
 
 #endif  // MACE_CORE_RUNTIME_OPENCL_OPENCL_RUNTIME_H_
diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index 62ea5488a87f53233c049915c8170ff8eb41d709..f7e509876f1564b06cbcd94e433a8ca3c03e197f 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -25,7 +25,6 @@
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/cl2_header.h"
 #endif
-#include "mace/public/mace.h"
 #include "mace/utils/logging.h"
 
 #ifdef MACE_ENABLE_NEON
@@ -38,10 +37,10 @@ namespace mace {
 #define MACE_SINGLE_ARG(...) __VA_ARGS__
-#define MACE_CASE(TYPE, STATEMENTS)   \
+#define MACE_CASE(TYPE, STATEMENTS)          \
   case DataTypeToEnum<TYPE>::value: {        \
     typedef TYPE T;                          \
-    STATEMENTS;                       \
+    STATEMENTS;                              \
     break;                                   \
   }
 
@@ -137,7 +136,7 @@ class Tensor {
     buffer_ = &buffer_slice_;
   }
 
-  Tensor() : Tensor(GetDeviceAllocator(CPU), DT_FLOAT) {}
+  Tensor() : Tensor(GetCPUAllocator(), DT_FLOAT) {}
 
   ~Tensor() {
     if (is_buffer_owner_ && buffer_ != nullptr) {
@@ -270,7 +269,7 @@ class Tensor {
     image_shape_ = image_shape;
     if (buffer_ == nullptr) {
       MACE_CHECK(is_buffer_owner_);
-      buffer_ = new Image();
+      buffer_ = new Image(allocator_);
       return buffer_->Allocate(image_shape, dtype_);
     } else {
       MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
diff --git a/mace/core/testing/test_benchmark_main.cc b/mace/core/testing/test_benchmark_main.cc
index 48a6928d40ad0ddf755ebcc311f9a125756e47e7..569a8345c147a763a2c2036b4ac082e60caed856 100644
--- a/mace/core/testing/test_benchmark_main.cc
+++ b/mace/core/testing/test_benchmark_main.cc
@@ -16,15 +16,10 @@
 
 #include "gflags/gflags.h"
 #include "mace/core/runtime/cpu/cpu_runtime.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
-#include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/logging.h"
 
 DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*");
-DEFINE_int32(gpu_perf_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
-DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
 DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
 DEFINE_int32(cpu_affinity_policy, 1,
              "0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");
@@ -43,10 +38,6 @@ int main(int argc, char **argv) {
     LOG(WARNING) << "Set openmp or cpu affinity failed.";
   }
 
-  mace::OpenCLRuntime::Configure(
-      static_cast<mace::GPUPerfHint>(FLAGS_gpu_perf_hint),
-      static_cast<mace::GPUPriorityHint>(FLAGS_gpu_priority_hint));
-
   mace::testing::Benchmark::Run(FLAGS_filter.c_str());
   return 0;
 }
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index 07d855605ed744d64345ab722225a274bc09063c..4c9204cbf085acda7f4a9497da2a5c80afab88f0 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -12,6 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "mace/core/workspace.h" + +#include #include #include #include @@ -21,8 +24,6 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/opencl_runtime.h" #endif -#include "mace/core/workspace.h" -#include "mace/utils/timer.h" namespace mace { @@ -35,8 +36,8 @@ bool ShouldPreallocateMemoryForOp(const OperatorDef &op) { } } // namespace -Workspace::Workspace() : host_scratch_buffer_(new ScratchBuffer( - GetDeviceAllocator(DeviceType::CPU))) {} +Workspace::Workspace() : + host_scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {} Tensor *Workspace::CreateTensor(const std::string &name, Allocator *alloc, @@ -74,7 +75,7 @@ std::vector Workspace::Tensors() const { } MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, - DeviceType type, + Device *device, const unsigned char *model_data) { MACE_LATENCY_LOGGER(1, "Load model tensors"); index_t model_data_size = 0; @@ -87,10 +88,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, } VLOG(3) << "Model data size: " << model_data_size; + const DeviceType device_type = device->device_type(); + if (model_data_size > 0) { #ifdef MACE_ENABLE_OPENCL - if (type == DeviceType::GPU && - OpenCLRuntime::Global()->GetDeviceMaxMemAllocSize() <= + if (device_type == DeviceType::GPU && + device->opencl_runtime()->GetDeviceMaxMemAllocSize() <= static_cast(model_data_size)) { for (auto &const_tensor : net_def.tensors()) { MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name()); @@ -104,7 +107,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, } std::unique_ptr tensor( - new Tensor(GetDeviceAllocator(type), + new Tensor(device->allocator(), const_tensor.data_type(), true)); tensor->Resize(dims); @@ -129,14 +132,14 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, #else { #endif - if (type == DeviceType::CPU) { + if (device_type == DeviceType::CPU) { tensor_buffer_ = std::unique_ptr( - new Buffer(GetDeviceAllocator(type), + new Buffer(device->allocator(), const_cast(model_data), model_data_size)); } else { tensor_buffer_ = std::unique_ptr( - new Buffer(GetDeviceAllocator(type))); + new Buffer(device->allocator())); MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size)); tensor_buffer_->Map(nullptr); tensor_buffer_->Copy(const_cast(model_data), @@ -170,12 +173,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, } } - if (type == DeviceType::CPU || type == DeviceType::GPU) { - MaceStatus status = CreateOutputTensorBuffer(net_def, type); + if (device_type == DeviceType::CPU || device_type == DeviceType::GPU) { + MaceStatus status = CreateOutputTensorBuffer(net_def, device); if (status != MaceStatus::MACE_SUCCESS) return status; } - if (type == DeviceType::CPU && net_def.has_quantize_info()) { + if (device_type == DeviceType::CPU && net_def.has_quantize_info()) { for (const auto &activation_info: net_def.quantize_info().activation_info()) { if (HasTensor(activation_info.tensor_name())) { @@ -193,7 +196,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, } MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, - DeviceType device_type) { + Device *device) { + DeviceType device_type = device->device_type(); DataType dtype = DataType::DT_INVALID; if (net_def.mem_arena().mem_block_size() > 0) { // We use the data type of the first op with mem id, @@ -227,7 +231,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, << ", memory type: " << mem_block.mem_type(); if (mem_block.mem_type() == MemoryType::CPU_BUFFER) { std::unique_ptr 
-              new Buffer(GetDeviceAllocator(DeviceType::CPU)));
+              new Buffer(GetCPUAllocator()));
           MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
               mem_block.x() * GetEnumTypeSize(dtype)
                   + MACE_EXTRA_BUFFER_PAD_SIZE));
           preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                             std::move(tensor_buf));
         } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
           std::unique_ptr<BufferBase> image_buf(
-              new Image());
+              new Image(device->allocator()));
           MACE_RETURN_IF_ERROR(image_buf->Allocate(
               {mem_block.x(), mem_block.y()}, dtype));
           preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                             std::move(image_buf));
         } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
           std::unique_ptr<BufferBase> tensor_buf(
-              new Buffer(GetDeviceAllocator(DeviceType::GPU)));
+              new Buffer(device->allocator()));
           MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
               mem_block.x() * GetEnumTypeSize(dtype)));
           preallocated_allocator_.SetBuffer(mem_block.mem_id(),
@@ -305,7 +309,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
                 op, "T", static_cast<int>(DT_FLOAT)));
           }
           CreateTensor(op.output(i),
-                       GetDeviceAllocator(device_type),
+                       device->allocator(),
                        output_type);
         }
       }
@@ -335,7 +339,8 @@ void Workspace::RemoveUnusedBuffer() {
 }
 
 void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
-                                      const unsigned char *model_data) {
+                                      const unsigned char *model_data,
+                                      Allocator *alloc) {
   for (auto &const_tensor : net_def.tensors()) {
     auto iter = tensor_map_.find(const_tensor.name());
     if (iter->second->unused()) {
@@ -347,8 +352,7 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
         dims.push_back(d);
       }
       std::unique_ptr<Tensor> tensor(
-          new Tensor(GetDeviceAllocator(DeviceType::GPU),
-                     const_tensor.data_type()));
+          new Tensor(alloc, const_tensor.data_type()));
       tensor->Resize(dims);
       MACE_CHECK(tensor->size() == const_tensor.data_size(),
                  "Tensor's data_size not equal with the shape");
diff --git a/mace/core/workspace.h b/mace/core/workspace.h
index 20f214b0018a93b59b84d8bf4cae7004e4e6ba0d..71850098e03593083454acc7102743a0cc106f1b 100644
--- a/mace/core/workspace.h
+++ b/mace/core/workspace.h
@@ -20,6 +20,7 @@
 #include <string>
 #include <vector>
 
+#include "mace/core/device.h"
 #include "mace/core/preallocated_pooled_allocator.h"
 #include "mace/core/tensor.h"
 #include "mace/public/mace.h"
@@ -48,7 +49,7 @@ class Workspace {
   std::vector<Tensor *> Tensors() const;
 
   MaceStatus LoadModelTensor(const NetDef &net_def,
-                             DeviceType type,
+                             Device *device,
                             const unsigned char *model_data);
 
   ScratchBuffer *GetScratchBuffer(DeviceType device_type);
@@ -56,11 +57,14 @@ class Workspace {
   void RemoveUnusedBuffer();
 
   void RemoveAndReloadBuffer(const NetDef &net_def,
-                             const unsigned char *model_data);
+                             const unsigned char *model_data,
+                             Allocator *alloc);
 
  private:
   MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
-                                      DeviceType device_type);
+                                      Device *device);
+
+  Device *device_;
 
   TensorMap tensor_map_;
diff --git a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/AppModel.java b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/AppModel.java
index edd7bf1802012012a0881fb82e00924a1c109405..5788801c0277a1d24834dd6d8470cd2d02c8f939 100644
--- a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/AppModel.java
+++ b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/AppModel.java
@@ -37,15 +37,13 @@ public class AppModel {
         mJniThread = new Handler(thread.getLooper());
     }
 
-    public void maceMobilenetSetAttrs(final InitData initData) {
+    public void maceMobilenetCreateGPUContext(final InitData initData) {
         mJniThread.post(new Runnable() {
             @Override
             public void run() {
-                int result = JniMaceUtils.maceMobilenetSetAttrs(
-                        initData.getOmpNumThreads(), initData.getCpuAffinityPolicy(),
-                        initData.getGpuPerfHint(), initData.getGpuPriorityHint(),
-                        initData.getKernelPath());
-                Log.i("APPModel", "maceMobilenetSetAttrs result = " + result);
+                int result = JniMaceUtils.maceMobilenetCreateGPUContext(
+                        initData.getStoragePath());
+                Log.i("APPModel", "maceMobilenetCreateGPUContext result = " + result);
             }
         });
     }
@@ -54,7 +52,10 @@ public class AppModel {
         mJniThread.post(new Runnable() {
             @Override
             public void run() {
-                int result = JniMaceUtils.maceMobilenetCreateEngine(initData.getModel(), initData.getDevice());
+                int result = JniMaceUtils.maceMobilenetCreateEngine(
+                        initData.getOmpNumThreads(), initData.getCpuAffinityPolicy(),
+                        initData.getGpuPerfHint(), initData.getGpuPriorityHint(),
+                        initData.getModel(), initData.getDevice());
                 Log.i("APPModel", "maceMobilenetCreateEngine result = " + result);
 
                 if (result == -1) {
diff --git a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/CameraActivity.java b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/CameraActivity.java
index ab62a90fcf9227398b7fb8ab159a09ec3984aed1..f8adafc845c37e341e084fa208085dfaeaf4c3a2 100644
--- a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/CameraActivity.java
+++ b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/CameraActivity.java
@@ -139,7 +139,7 @@ public class CameraActivity extends Activity implements View.OnClickListener, Ap
     }
 
     private void initJni() {
-        AppModel.instance.maceMobilenetSetAttrs(initData);
+        AppModel.instance.maceMobilenetCreateGPUContext(initData);
         AppModel.instance.maceMobilenetCreateEngine(initData, this);
     }
diff --git a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/result/InitData.java b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/result/InitData.java
index ab0f54b587bc9bbcd574c12b6f449172cade1689..ffcbde9605a841f959fcec3ba5016618f004e223 100644
--- a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/result/InitData.java
+++ b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/result/InitData.java
@@ -29,7 +29,7 @@ public class InitData {
     private int cpuAffinityPolicy;
     private int gpuPerfHint;
     private int gpuPriorityHint;
-    private String kernelPath = "";
+    private String storagePath = "";
 
     public InitData() {
         model = MODELS[0];
@@ -38,8 +38,8 @@ public class InitData {
         gpuPerfHint = 3;
         gpuPriorityHint = 3;
         device = DEVICES[0];
-        kernelPath = Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "mace";
-        File file = new File(kernelPath);
+        storagePath = Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "mace";
+        File file = new File(storagePath);
         if (!file.exists()) {
             file.mkdir();
         }
@@ -94,11 +94,11 @@ public class InitData {
         this.gpuPriorityHint = gpuPriorityHint;
     }
 
-    public String getKernelPath() {
-        return kernelPath;
+    public String getStoragePath() {
+        return storagePath;
     }
 
-    public void setKernelPath(String kernelPath) {
-        this.kernelPath = kernelPath;
+    public void setStoragePath(String storagePath) {
+        this.storagePath = storagePath;
     }
 }
diff --git a/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc b/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc
index 0a0702702ae0f18ad31998fa562e0d0fa7237d16..4ccba56efa6b1a8de476eb6c7c7e00632136099f 100755
--- a/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc
+++ b/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc @@ -26,7 +26,6 @@ #include #include "src/main/cpp/include/mace/public/mace.h" -#include "src/main/cpp/include/mace/public/mace_runtime.h" #include "src/main/cpp/include/mace/public/mace_engine_factory.h" namespace { @@ -39,8 +38,8 @@ struct ModelInfo { }; struct MaceContext { + std::shared_ptr gpu_context; std::shared_ptr engine; - std::shared_ptr storage_factory; std::string model_name; mace::DeviceType device_type = mace::DeviceType::CPU; std::map model_infos = { @@ -72,48 +71,65 @@ MaceContext& GetMaceContext() { } // namespace -JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs( - JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy, - jint gpu_perf_hint, jint gpu_priority_hint, jstring kernel_path) { +JNIEXPORT jint JNICALL +Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext( + JNIEnv *env, jclass thisObj, jstring storage_path) { MaceContext &mace_context = GetMaceContext(); + // DO NOT USE tmp directory. + // Please use APP's own directory and make sure the directory exists. + const char *storage_path_ptr = env->GetStringUTFChars(storage_path, nullptr); + if (storage_path_ptr == nullptr) return JNI_ERR; + const std::string storage_file_path(storage_path_ptr); + env->ReleaseStringUTFChars(storage_path, storage_path_ptr); - mace::MaceStatus status; - // openmp - status = mace::SetOpenMPThreadPolicy( - omp_num_threads, - static_cast(cpu_affinity_policy)); - - __android_log_print(ANDROID_LOG_ERROR, - "image_classify attrs", - "openmp result: %d, threads: %d, cpu: %d", - status, omp_num_threads, cpu_affinity_policy); - - // gpu - mace::SetGPUHints( - static_cast(gpu_perf_hint), - static_cast(gpu_priority_hint)); - - __android_log_print(ANDROID_LOG_ERROR, - "image_classify attrs", - "gpu perf: %d, priority: %d", - gpu_perf_hint, gpu_priority_hint); - - // opencl cache - const char *kernel_path_ptr = env->GetStringUTFChars(kernel_path, nullptr); - if (kernel_path_ptr == nullptr) return JNI_ERR; - const std::string kernel_file_path(kernel_path_ptr); - mace_context.storage_factory.reset( - new mace::FileStorageFactory(kernel_file_path)); - mace::SetKVStorageFactory(mace_context.storage_factory); - env->ReleaseStringUTFChars(kernel_path, kernel_path_ptr); + mace_context.gpu_context = mace::GPUContextBuilder() + .SetStoragePath(storage_file_path) + .Finalize(); return JNI_OK; } JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( - JNIEnv *env, jclass thisObj, jstring model_name_str, jstring device) { + JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy, + jint gpu_perf_hint, jint gpu_priority_hint, + jstring model_name_str, jstring device) { MaceContext &mace_context = GetMaceContext(); + + // get device + const char *device_ptr = env->GetStringUTFChars(device, nullptr); + if (device_ptr == nullptr) return JNI_ERR; + mace_context.device_type = ParseDeviceType(device_ptr); + env->ReleaseStringUTFChars(device, device_ptr); + + // create MaceEngineConfig + mace::MaceStatus status; + mace::MaceEngineConfig config(mace_context.device_type); + status = config.SetCPUThreadPolicy( + omp_num_threads, + static_cast(cpu_affinity_policy)); + if (status != mace::MACE_SUCCESS) { + __android_log_print(ANDROID_LOG_ERROR, + "image_classify attrs", + "openmp result: %d, threads: %d, cpu: %d", + status, omp_num_threads, cpu_affinity_policy); + } + if (mace_context.device_type == mace::DeviceType::GPU) { + 
config.SetGPUContext(mace_context.gpu_context);
+    config.SetGPUHints(
+        static_cast<mace::GPUPerfHint>(gpu_perf_hint),
+        static_cast<mace::GPUPriorityHint>(gpu_priority_hint));
+    __android_log_print(ANDROID_LOG_INFO,
+                        "image_classify attrs",
+                        "gpu perf: %d, priority: %d",
+                        gpu_perf_hint, gpu_priority_hint);
+  }
+
+  __android_log_print(ANDROID_LOG_INFO,
+                      "image_classify attrs",
+                      "device: %d",
+                      mace_context.device_type);
+
   // parse model name
   const char *model_name_ptr = env->GetStringUTFChars(model_name_str, nullptr);
   if (model_name_ptr == nullptr) return JNI_ERR;
@@ -133,26 +149,15 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine(
   std::vector<std::string> input_names = {model_info_iter->second.input_name};
   std::vector<std::string> output_names = {model_info_iter->second.output_name};

-  // get device
-  const char *device_ptr = env->GetStringUTFChars(device, nullptr);
-  if (device_ptr == nullptr) return JNI_ERR;
-  mace_context.device_type = ParseDeviceType(device_ptr);
-  env->ReleaseStringUTFChars(device, device_ptr);
-
-  __android_log_print(ANDROID_LOG_ERROR,
-                      "image_classify attrs",
-                      "device: %d",
-                      mace_context.device_type);
-
   mace::MaceStatus create_engine_status =
       CreateMaceEngineFromCode(mace_context.model_name,
                                std::string(),
                                input_names,
                                output_names,
-                               mace_context.device_type,
+                               config,
                                &mace_context.engine);

-  __android_log_print(ANDROID_LOG_ERROR,
+  __android_log_print(ANDROID_LOG_INFO,
                       "image_classify attrs",
                       "create result: %d",
                       create_engine_status);
diff --git a/mace/examples/android/macelibrary/src/main/cpp/image_classify.h b/mace/examples/android/macelibrary/src/main/cpp/image_classify.h
index bef7417bcb945c50bfad6673f1a11eb3b551387e..5114eb911af090f3b39ea45c931314530ba1e3ca 100644
--- a/mace/examples/android/macelibrary/src/main/cpp/image_classify.h
+++ b/mace/examples/android/macelibrary/src/main/cpp/image_classify.h
@@ -24,11 +24,13 @@ extern "C" {
 #endif
 /*
  * Class:     com_xiaomi_mace_JniMaceUtils
- * Method:    maceMobilenetSetAttrs
- * Signature: (Ljava/lang/String;IIIILjava/lang/String;)I
+ * Method:    maceMobilenetCreateGPUContext
+ * Signature: (Ljava/lang/String;)I
  */
-JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs
-  (JNIEnv *, jclass, jint, jint, jint, jint, jstring);
+JNIEXPORT jint JNICALL
+Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext(JNIEnv *,
+                                                                jclass,
+                                                                jstring);
 /*
  * Class:     com_xiaomi_mace_JniMaceUtils
@@ -37,7 +39,7 @@ JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs
  */
 JNIEXPORT jint JNICALL
 Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine
-  (JNIEnv *, jclass, jstring, jstring);
+  (JNIEnv *, jclass, jint, jint, jint, jint, jstring, jstring);
 /*
  * Class:     com_xiaomi_mace_JniMaceUtils
diff --git a/mace/examples/android/macelibrary/src/main/java/com/xiaomi/mace/JniMaceUtils.java b/mace/examples/android/macelibrary/src/main/java/com/xiaomi/mace/JniMaceUtils.java
index f9ab7a7af7c9358dee896ec9b5bec5152f516fce..e776c013189f64b902feec7592fe53b5f0c41308 100644
--- a/mace/examples/android/macelibrary/src/main/java/com/xiaomi/mace/JniMaceUtils.java
+++ b/mace/examples/android/macelibrary/src/main/java/com/xiaomi/mace/JniMaceUtils.java
@@ -20,9 +20,9 @@ public class JniMaceUtils {
         System.loadLibrary("mace_mobile_jni");
     }

-    public static native int maceMobilenetSetAttrs(int ompNumThreads, int cpuAffinityPolicy, int gpuPerfHint, int gpuPriorityHint, String kernelPath);
+    public static native int maceMobilenetCreateGPUContext(String storagePath);

-    public static native int maceMobilenetCreateEngine(String model, String device);
+    public static native int maceMobilenetCreateEngine(int ompNumThreads, int cpuAffinityPolicy, int gpuPerfHint, int gpuPriorityHint, String model, String device);

     public static native float[] maceMobilenetClassify(float[] input);
diff --git a/mace/examples/cli/example.cc b/mace/examples/cli/example.cc
index 4892baf2e3a1dbc69f688b76a34f7d7a7a21f7dd..99436fa4876bcb1731dab73aac39f70ea8ef136a 100644
--- a/mace/examples/cli/example.cc
+++ b/mace/examples/cli/example.cc
@@ -21,7 +21,6 @@
 #include "gflags/gflags.h"
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 // if convert model to code.
 #ifdef MODEL_GRAPH_FORMAT_CODE
 #include "mace/codegen/engine/mace_engine_factory.h"
 #endif
@@ -157,40 +156,40 @@ bool RunModel(const std::vector<std::string> &input_names,
               const std::vector<std::vector<int64_t>> &output_shapes) {
   // load model
   DeviceType device_type = ParseDeviceType(FLAGS_device);
-  // config runtime
-  mace::SetOpenMPThreadPolicy(
+  // configuration
+  // For detailed information, please see mace.h
+  MaceStatus status;
+  MaceEngineConfig config(device_type);
+  status = config.SetCPUThreadPolicy(
       FLAGS_omp_num_threads,
       static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy));
+  if (status != MACE_SUCCESS) {
+    std::cerr << "Set openmp or cpu affinity failed." << std::endl;
+  }
 #ifdef MACE_ENABLE_OPENCL
+  std::shared_ptr<GPUContext> gpu_context;
   if (device_type == DeviceType::GPU) {
-    mace::SetGPUHints(
-        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
-        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
-
-    // Just call once. (Not thread-safe)
-    // Set paths of Generated OpenCL Compiled Kernel Binary file
-    // if you build gpu library of specific soc.
-    // Using OpenCL binary will speed up the initialization.
-    // OpenCL binary is corresponding to the OpenCL Driver version,
-    // you should update the binary when OpenCL Driver changed.
+    // DO NOT USE tmp directory.
+    // Please use APP's own directory and make sure the directory exists.
+    const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
+    const std::string storage_path =
+        std::string(storage_path_ptr == nullptr ?
+                    "/data/local/tmp/mace_run/interior" : storage_path_ptr);
     std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
-    mace::SetOpenCLBinaryPaths(opencl_binary_paths);
-    mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file);
+    gpu_context = GPUContextBuilder()
+        .SetStoragePath(storage_path)
+        .SetOpenCLBinaryPaths(opencl_binary_paths)
+        .SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
+        .Finalize();
+
+    config.SetGPUContext(gpu_context);
+    config.SetGPUHints(
+        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
+        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
   }
 #endif  // MACE_ENABLE_OPENCL
-  // DO NOT USE tmp directory.
-  // Please use APP's own directory and make sure the directory exists.
-  // Just call once
-  const std::string internal_storage_path =
-      "/data/local/tmp/mace_run/interior";
-
-  // Config internal kv storage factory.
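// ---------------------------------------------------------------------------
// Editor's note (not part of the diff): the hunks above migrate engine setup
// from process-wide Set*() calls to a per-engine MaceEngineConfig plus a
// shareable GPUContext. The sketch below condenses that flow into a single
// function. It is illustrative only: the model/tensor names are placeholders,
// the enum values (AFFINITY_BIG_ONLY, PERF_HIGH, PRIORITY_LOW) are assumed
// from MACE's public headers, and CreateMaceEngineFromCode() is the
// code-generated factory included via mace_engine_factory.h.

#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include "mace/public/mace.h"

std::shared_ptr<mace::MaceEngine> BuildEngine(mace::DeviceType device_type) {
  // CPU thread policy now lives on the config object, not in global state.
  mace::MaceEngineConfig config(device_type);
  mace::MaceStatus status = config.SetCPUThreadPolicy(
      4,  // omp_num_threads
      mace::CPUAffinityPolicy::AFFINITY_BIG_ONLY);
  if (status != mace::MACE_SUCCESS) {
    // The policy is a hint; engine creation can still proceed.
    std::cerr << "Set openmp or cpu affinity failed." << std::endl;
  }

  if (device_type == mace::DeviceType::GPU) {
    // The OpenCL storage path (and, optionally, precompiled binaries and
    // tuned parameters) hang off a GPUContext that engines can share.
    std::shared_ptr<mace::GPUContext> gpu_context =
        mace::GPUContextBuilder()
            .SetStoragePath("/data/data/com.example.app/files/mace")  // app-owned dir
            .Finalize();
    config.SetGPUContext(gpu_context);
    config.SetGPUHints(mace::GPUPerfHint::PERF_HIGH,
                       mace::GPUPriorityHint::PRIORITY_LOW);
  }

  // Engine creation now consumes the config instead of a bare DeviceType.
  std::shared_ptr<mace::MaceEngine> engine;
  std::vector<std::string> input_names = {"input"};    // placeholder name
  std::vector<std::string> output_names = {"output"};  // placeholder name
  mace::MaceStatus create_status = mace::CreateMaceEngineFromCode(
      "mobilenet_v1",  // placeholder model tag
      std::string(), input_names, output_names, config, &engine);
  return create_status == mace::MACE_SUCCESS ? engine : nullptr;
}
// ---------------------------------------------------------------------------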
- std::shared_ptr storage_factory( - new FileStorageFactory(internal_storage_path)); - SetKVStorageFactory(storage_factory); - // Create Engine std::shared_ptr engine; MaceStatus create_engine_status; @@ -204,7 +203,7 @@ bool RunModel(const std::vector &input_names, FLAGS_model_data_file, input_names, output_names, - device_type, + config, &engine); #else std::vector model_pb_data; @@ -216,7 +215,7 @@ bool RunModel(const std::vector &input_names, FLAGS_model_data_file, input_names, output_names, - device_type, + config, &engine); #endif diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h index 51383ad44fe0c737acc3229287921e7c47b0173e..3159684d109b96c09e3909ffc296f083c0278a4d 100644 --- a/mace/kernels/activation.h +++ b/mace/kernels/activation.h @@ -23,6 +23,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/core/types.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -126,10 +127,14 @@ template class ActivationFunctor; template <> -class ActivationFunctor { +class ActivationFunctor : OpKernel { public: - ActivationFunctor(ActivationType type, float relux_max_limit) - : activation_(type), relux_max_limit_(relux_max_limit) {} + ActivationFunctor(OpKernelContext *context, + ActivationType type, + float relux_max_limit) + : OpKernel(context), + activation_(type), + relux_max_limit_(relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *alpha, @@ -159,10 +164,14 @@ class ActivationFunctor { #ifdef MACE_ENABLE_OPENCL template -class ActivationFunctor { +class ActivationFunctor : OpKernel { public: - ActivationFunctor(ActivationType type, T relux_max_limit) - : activation_(type), relux_max_limit_(static_cast(relux_max_limit)) {} + ActivationFunctor(OpKernelContext *context, + ActivationType type, + T relux_max_limit) + : OpKernel(context), + activation_(type), + relux_max_limit_(static_cast(relux_max_limit)) {} MaceStatus operator()(const Tensor *input, const Tensor *alpha, diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h index 2215343f5c092870f4a400061afc857c96f3a465..d81f25a337410d1225f9d8e49e071e496372d79a 100644 --- a/mace/kernels/addn.h +++ b/mace/kernels/addn.h @@ -24,6 +24,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -35,10 +36,11 @@ namespace kernels { constexpr int kCostPerGroup = 1024; template -struct AddNFunctor { +struct AddNFunctor : OpKernel { + explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const std::vector &input_tensors, - Tensor *output_tensor, - StatsFuture *future) { + Tensor *output_tensor, + StatsFuture *future) { MACE_UNUSED(future); MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensors[0])); index_t size = output_tensor->size(); @@ -95,7 +97,8 @@ struct AddNFunctor { #ifdef MACE_ENABLE_OPENCL template -struct AddNFunctor { +struct AddNFunctor : OpKernel { + explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const std::vector &input_tensors, Tensor *output_tensor, StatsFuture *future); diff --git a/mace/kernels/argmax.h b/mace/kernels/argmax.h index 54edc3ee7b718a69b7b7136dbba587f07d654997..36218d627fce5f220cd89120728e73887155fb16 100644 --- a/mace/kernels/argmax.h +++ b/mace/kernels/argmax.h @@ -23,6 +23,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include 
"mace/public/mace.h" #include "mace/utils/utils.h" @@ -30,7 +31,8 @@ namespace mace { namespace kernels { template -struct ArgMaxFunctor { +struct ArgMaxFunctor : OpKernel { + explicit ArgMaxFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, const Tensor *axis, Tensor *output, diff --git a/mace/kernels/arm/conv_winograd_test.cc b/mace/kernels/arm/conv_winograd_test.cc index 166b67a5e4a33fd5165d4b3b8ec7de9ed0f683d4..1313543220580b896965bc3ef240e31b6edc3b09 100644 --- a/mace/kernels/arm/conv_winograd_test.cc +++ b/mace/kernels/arm/conv_winograd_test.cc @@ -37,10 +37,10 @@ TEST(ConvWinogradTest, winograd) { index_t filter_size = 3 * 3 * in_channels * out_channels; index_t output_size = batch * out_channels * out_height * out_width; - Tensor input; - Tensor filter; - Tensor output; - Tensor output_ref; + Tensor input(GetCPUAllocator(), DataType::DT_FLOAT); + Tensor filter(GetCPUAllocator(), DataType::DT_FLOAT); + Tensor output(GetCPUAllocator(), DataType::DT_FLOAT); + Tensor output_ref(GetCPUAllocator(), DataType::DT_FLOAT); input.Resize({batch, in_channels, in_height, in_width}); filter.Resize({out_channels, in_channels, 3, 3}); diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index 6f934e6b14484475c96d6ffa34ce43deb3e0ebbe..4c9aac3a6415fdd8bc60e1af34ca3d51e9ca9a12 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -33,11 +33,13 @@ namespace mace { namespace kernels { -struct BatchNormFunctorBase { - BatchNormFunctorBase(bool folded_constant, +struct BatchNormFunctorBase : OpKernel { + BatchNormFunctorBase(OpKernelContext *context, + bool folded_constant, const ActivationType activation, const float relux_max_limit) - : folded_constant_(folded_constant), + : OpKernel(context), + folded_constant_(folded_constant), activation_(activation), relux_max_limit_(relux_max_limit) {} @@ -51,10 +53,14 @@ struct BatchNormFunctor; template<> struct BatchNormFunctor : BatchNormFunctorBase { - BatchNormFunctor(const bool folded_constant, + BatchNormFunctor(OpKernelContext *context, + const bool folded_constant, const ActivationType activation, const float relux_max_limit) - : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {} + : BatchNormFunctorBase(context, + folded_constant, + activation, + relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *scale, @@ -132,10 +138,14 @@ struct BatchNormFunctor : BatchNormFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct BatchNormFunctor : BatchNormFunctorBase { - BatchNormFunctor(const bool folded_constant, + BatchNormFunctor(OpKernelContext *context, + const bool folded_constant, const ActivationType activation, const float relux_max_limit) - : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {} + : BatchNormFunctorBase(context, + folded_constant, + activation, + relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *scale, const Tensor *offset, diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h index 1cd8421ccbf1c38a7b6cb86255cb1b652714b938..e2ea8ccfb88308e6fcfa3de731e6905fe970b13b 100644 --- a/mace/kernels/bias_add.h +++ b/mace/kernels/bias_add.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -30,10 +31,10 @@ namespace mace { namespace kernels { -struct BiasAddFunctorBase { - explicit BiasAddFunctorBase(const DataFormat data_format) { - data_format_ = 
data_format; - } +struct BiasAddFunctorBase : OpKernel { + BiasAddFunctorBase(OpKernelContext *context, + const DataFormat data_format) + : OpKernel(context), data_format_(data_format) {} DataFormat data_format_; }; @@ -43,8 +44,9 @@ struct BiasAddFunctor; template <> struct BiasAddFunctor : BiasAddFunctorBase { - explicit BiasAddFunctor(const DataFormat data_format) - : BiasAddFunctorBase(data_format) {} + BiasAddFunctor(OpKernelContext *context, + const DataFormat data_format) + : BiasAddFunctorBase(context, data_format) {} MaceStatus operator()(const Tensor *input, const Tensor *bias, @@ -96,8 +98,8 @@ struct BiasAddFunctor : BiasAddFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct BiasAddFunctor : BiasAddFunctorBase { - explicit BiasAddFunctor(const DataFormat data_format) - : BiasAddFunctorBase(data_format) {} + BiasAddFunctor(OpKernelContext *context, const DataFormat data_format) + : BiasAddFunctorBase(context, data_format) {} MaceStatus operator()(const Tensor *input, const Tensor *bias, Tensor *output, diff --git a/mace/kernels/buffer_to_image.h b/mace/kernels/buffer_to_image.h index 1def908705686085ac5e5a73f9e022e6f4df27e1..4a2f731b0e49baee5db257998e1d82c665a0aee2 100644 --- a/mace/kernels/buffer_to_image.h +++ b/mace/kernels/buffer_to_image.h @@ -20,21 +20,24 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/kernels/opencl/common.h" namespace mace { namespace kernels { -struct BufferToImageFunctorBase { - explicit BufferToImageFunctorBase(const int wino_blk_size) - : wino_blk_size_(wino_blk_size) {} +struct BufferToImageFunctorBase : OpKernel { + explicit BufferToImageFunctorBase(OpKernelContext *context, + const int wino_blk_size) + : OpKernel(context), wino_blk_size_(wino_blk_size) {} const int wino_blk_size_; }; template struct BufferToImageFunctor : BufferToImageFunctorBase { - explicit BufferToImageFunctor(const int wino_blk_size) - : BufferToImageFunctorBase(wino_blk_size) {} + explicit BufferToImageFunctor(OpKernelContext *context, + const int wino_blk_size) + : BufferToImageFunctorBase(context, wino_blk_size) {} MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, @@ -50,8 +53,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase { template struct BufferToImageFunctor : BufferToImageFunctorBase { - explicit BufferToImageFunctor(const int wino_blk_size) - : BufferToImageFunctorBase(wino_blk_size) {} + explicit BufferToImageFunctor(OpKernelContext *context, + const int wino_blk_size) + : BufferToImageFunctorBase(context, wino_blk_size) {} MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h index 920e1e1a2bf96bcc7d55009244bda693050f3780..029eb1c66b665baed39cacec05c9dbe9b45ca1b5 100644 --- a/mace/kernels/channel_shuffle.h +++ b/mace/kernels/channel_shuffle.h @@ -20,13 +20,15 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" namespace mace { namespace kernels { template -struct ChannelShuffleFunctor { - explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {} +struct ChannelShuffleFunctor : OpKernel { + ChannelShuffleFunctor(OpKernelContext *context, const int groups) + : OpKernel(context), groups_(groups) {} MaceStatus operator()(const Tensor *input, Tensor *output, @@ -70,8 +72,9 @@ struct ChannelShuffleFunctor { #ifdef MACE_ENABLE_OPENCL template -struct ChannelShuffleFunctor { - explicit 
ChannelShuffleFunctor(const int groups) : groups_(groups) {} +struct ChannelShuffleFunctor : OpKernel { + ChannelShuffleFunctor(OpKernelContext *context, const int groups) + : OpKernel(context), groups_(groups) {} MaceStatus operator()(const Tensor *input, Tensor *output, diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h index 1728ca088bb8c0161191eba6cf5eef5459bb7139..696d4ff034c852ff9fdfd2f38b0682fc8b2dfe50 100644 --- a/mace/kernels/concat.h +++ b/mace/kernels/concat.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/core/types.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -30,15 +31,17 @@ namespace mace { namespace kernels { -struct ConcatFunctorBase { - explicit ConcatFunctorBase(const int32_t axis) : axis_(axis) {} +struct ConcatFunctorBase : OpKernel { + ConcatFunctorBase(OpKernelContext *context, const int32_t axis) + : OpKernel(context), axis_(axis) {} int32_t axis_; }; template struct ConcatFunctor : ConcatFunctorBase { - explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {} + ConcatFunctor(OpKernelContext *context, const int32_t axis) + : ConcatFunctorBase(context, axis) {} MaceStatus operator()(const std::vector &input_list, Tensor *output, @@ -97,7 +100,8 @@ struct ConcatFunctor : ConcatFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct ConcatFunctor : ConcatFunctorBase { - explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {} + ConcatFunctor(OpKernelContext *context, const int32_t axis) + : ConcatFunctorBase(context, axis) {} MaceStatus operator()(const std::vector &input_list, Tensor *output, diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index 282472bca3829cd112a0be0d62a5409fdb3bbfc5..ce9bb11d24807261e556ad5b6e4c6347d1ba6eab 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -42,14 +42,16 @@ namespace mace { namespace kernels { -struct Conv2dFunctorBase { - Conv2dFunctorBase(const int *strides, +struct Conv2dFunctorBase : OpKernel { + Conv2dFunctorBase(OpKernelContext *context, + const int *strides, const Padding &padding_type, const std::vector &paddings, const int *dilations, const ActivationType activation, const float relux_max_limit) - : strides_(strides), + : OpKernel(context), + strides_(strides), padding_type_(padding_type), paddings_(paddings), dilations_(dilations), @@ -69,7 +71,8 @@ struct Conv2dFunctor; template<> struct Conv2dFunctor : Conv2dFunctorBase { - Conv2dFunctor(const int *strides, + Conv2dFunctor(OpKernelContext *context, + const int *strides, const Padding &padding_type, const std::vector &paddings, const int *dilations, @@ -77,12 +80,14 @@ struct Conv2dFunctor : Conv2dFunctorBase { const float relux_max_limit, const bool is_filter_transformed, ScratchBuffer *scratch) - : Conv2dFunctorBase(strides, + : Conv2dFunctorBase(context, + strides, padding_type, paddings, dilations, activation, relux_max_limit), + transformed_filter_(GetCPUAllocator(), DataType::DT_FLOAT), is_filter_transformed_(is_filter_transformed), scratch_(scratch) {} @@ -721,7 +726,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { template<> struct Conv2dFunctor : Conv2dFunctorBase { - Conv2dFunctor(const int *strides, + Conv2dFunctor(OpKernelContext *context, + const int *strides, const Padding &padding_type, const std::vector &paddings, const int *dilations, @@ -729,7 +735,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { const float relux_max_limit, const bool is_filter_transformed, ScratchBuffer *scratch) - : 
Conv2dFunctorBase(strides, + : Conv2dFunctorBase(context, + strides, padding_type, paddings, dilations, @@ -949,7 +956,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct Conv2dFunctor : Conv2dFunctorBase { - Conv2dFunctor(const int *strides, + Conv2dFunctor(OpKernelContext *context, + const int *strides, const Padding &padding_type, const std::vector &paddings, const int *dilations, @@ -957,7 +965,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { const float relux_max_limit, const bool is_filter_transformed, ScratchBuffer *scratch) - : Conv2dFunctorBase(strides, + : Conv2dFunctorBase(context, + strides, padding_type, paddings, dilations, @@ -968,10 +977,10 @@ struct Conv2dFunctor : Conv2dFunctorBase { } MaceStatus operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future); + const Tensor *filter, + const Tensor *bias, + Tensor *output, + StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; diff --git a/mace/kernels/crop.h b/mace/kernels/crop.h index 241584e849906a49a979002871e431c82c6503ed..6ad9650ee406d13a8ca2b64b41fadd81ce462ca6 100644 --- a/mace/kernels/crop.h +++ b/mace/kernels/crop.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/core/types.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -30,10 +31,12 @@ namespace mace { namespace kernels { -struct CropFunctorBase { - CropFunctorBase(const int axis, +struct CropFunctorBase : OpKernel { + CropFunctorBase(OpKernelContext *context, + const int axis, const std::vector &offset) - : axis_(axis), + : OpKernel(context), + axis_(axis), offset_(offset) {} const int axis_; @@ -42,8 +45,10 @@ struct CropFunctorBase { template struct CropFunctor : CropFunctorBase { - CropFunctor(const int axis, const std::vector &offset) - : CropFunctorBase(axis, offset) {} + CropFunctor(OpKernelContext *context, + const int axis, + const std::vector &offset) + : CropFunctorBase(context, axis, offset) {} void crop_copy(const T* input_data, T* output_data, const std::vector &input_shape, @@ -121,12 +126,14 @@ struct CropFunctor : CropFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct CropFunctor : CropFunctorBase { - CropFunctor(const int axis, const std::vector &offset) - : CropFunctorBase(axis, offset) {} + CropFunctor(OpKernelContext *context, + const int axis, + const std::vector &offset) + : CropFunctorBase(context, axis, offset) {} MaceStatus operator()(const std::vector &input_list, - Tensor *output, - StatsFuture *future); + Tensor *output, + StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; std::unique_ptr kernel_error_; diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h index 9450104d5abaf7f99a1d97c1d6fff11505562252..4bfc4d613a1454624e1a373f677ebd4df29c1db9 100644 --- a/mace/kernels/deconv_2d.h +++ b/mace/kernels/deconv_2d.h @@ -89,14 +89,16 @@ void Deconv2dNCHW(const T *input, } } // namespace deconv -struct Deconv2dFunctorBase { - Deconv2dFunctorBase(const std::vector &strides, +struct Deconv2dFunctorBase : OpKernel { + Deconv2dFunctorBase(OpKernelContext *context, + const std::vector &strides, const Padding &padding_type, const std::vector &paddings, const std::vector &output_shape, const ActivationType activation, const float relux_max_limit) - : strides_(strides), + : OpKernel(context), + strides_(strides), padding_type_(padding_type), paddings_(paddings), output_shape_(output_shape), @@ -210,13 +212,15 @@ struct 
Deconv2dFunctorBase { template struct Deconv2dFunctor : Deconv2dFunctorBase { - Deconv2dFunctor(const std::vector &strides, + Deconv2dFunctor(OpKernelContext *context, + const std::vector &strides, const Padding &padding_type, const std::vector &paddings, const std::vector &output_shape, const ActivationType activation, const float relux_max_limit) - : Deconv2dFunctorBase(strides, + : Deconv2dFunctorBase(context, + strides, padding_type, paddings, output_shape, @@ -315,13 +319,15 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct Deconv2dFunctor : Deconv2dFunctorBase { - Deconv2dFunctor(const std::vector &strides, + Deconv2dFunctor(OpKernelContext *context, + const std::vector &strides, const Padding &padding_type, const std::vector &paddings, const std::vector &output_shape, const ActivationType activation, const float relux_max_limit) - : Deconv2dFunctorBase(strides, + : Deconv2dFunctorBase(context, + strides, padding_type, paddings, output_shape, diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h index c0e0f2670fc2970d4d29cdb9ae4680e77a607a8d..7c4a7456a122b7028360ab560117ed7bce0e9a0a 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.h @@ -19,6 +19,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -29,9 +30,11 @@ namespace mace { namespace kernels { template -struct DepthToSpaceOpFunctor { - explicit DepthToSpaceOpFunctor(const int block_size, bool d2s) - : block_size_(block_size), d2s_(d2s) {} +struct DepthToSpaceOpFunctor : OpKernel { + DepthToSpaceOpFunctor(OpKernelContext *context, + const int block_size, + bool d2s) + : OpKernel(context), block_size_(block_size), d2s_(d2s) {} MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future) { @@ -123,9 +126,11 @@ struct DepthToSpaceOpFunctor { #ifdef MACE_ENABLE_OPENCL template -struct DepthToSpaceOpFunctor { - DepthToSpaceOpFunctor(const int block_size, bool d2s) - : block_size_(block_size), d2s_(d2s) {} +struct DepthToSpaceOpFunctor : OpKernel { + DepthToSpaceOpFunctor(OpKernelContext *context, + const int block_size, + bool d2s) + : OpKernel(context), block_size_(block_size), d2s_(d2s) {} MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future); diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index 9304b14f711f184616d42228cca0713b487f7511..3b2eb70bc78d26e586fba777945e4a674250df10 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -37,14 +37,16 @@ namespace mace { namespace kernels { -struct DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctorBase(const int *strides, +struct DepthwiseConv2dFunctorBase : OpKernel { + DepthwiseConv2dFunctorBase(OpKernelContext *context, + const int *strides, const Padding padding_type, const std::vector &paddings, const int *dilations, const ActivationType activation, const float relux_max_limit) - : strides_(strides), + : OpKernel(context), + strides_(strides), padding_type_(padding_type), paddings_(paddings), dilations_(dilations), @@ -65,13 +67,15 @@ struct DepthwiseConv2dFunctor; template<> struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctor(const int *strides, + DepthwiseConv2dFunctor(OpKernelContext *context, + const int *strides, const Padding padding_type, const std::vector &paddings, const int *dilations, const ActivationType activation, const float 
relux_max_limit) - : DepthwiseConv2dFunctorBase(strides, + : DepthwiseConv2dFunctorBase(context, + strides, padding_type, paddings, dilations, @@ -288,13 +292,15 @@ struct DepthwiseConv2dFunctor template<> struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctor(const int *strides, + DepthwiseConv2dFunctor(OpKernelContext *context, + const int *strides, const Padding padding_type, const std::vector &paddings, const int *dilations, const ActivationType activation, const float relux_max_limit) - : DepthwiseConv2dFunctorBase(strides, + : DepthwiseConv2dFunctorBase(context, + strides, padding_type, paddings, dilations, @@ -451,7 +457,7 @@ struct DepthwiseConv2dFunctor const int32_t *bias_data = nullptr; if (bias == nullptr) { zero_bias.reset( - new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32)); + new Tensor(GetCPUAllocator(), DT_INT32)); zero_bias->Resize(bias_shape); zero_bias->Clear(); bias_data = zero_bias->data(); @@ -495,13 +501,15 @@ struct DepthwiseConv2dFunctor template struct DepthwiseConv2dFunctor : DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctor(const int *strides, + DepthwiseConv2dFunctor(OpKernelContext *context, + const int *strides, const Padding padding_type, const std::vector &paddings, const int *dilations, const ActivationType activation, const float relux_max_limit) - : DepthwiseConv2dFunctorBase(strides, + : DepthwiseConv2dFunctorBase(context, + strides, padding_type, paddings, dilations, diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h index 42d220fa2dd5a7c6bbd39052dd6a99960d24cda5..9e9a2be8f9e17cde08665e29c5debce4275e4eb2 100644 --- a/mace/kernels/eltwise.h +++ b/mace/kernels/eltwise.h @@ -23,6 +23,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -802,13 +803,15 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } } -struct EltwiseFunctorBase { - EltwiseFunctorBase(const EltwiseType type, +struct EltwiseFunctorBase : OpKernel { + EltwiseFunctorBase(OpKernelContext *context, + const EltwiseType type, const std::vector &coeff, const float scalar_input, const int32_t scalar_input_index, const DataFormat data_format) - : type_(type), + : OpKernel(context), + type_(type), coeff_(coeff), scalar_input_(scalar_input), scalar_input_index_(scalar_input_index), @@ -823,12 +826,14 @@ struct EltwiseFunctorBase { template struct EltwiseFunctor : EltwiseFunctorBase { - EltwiseFunctor(const EltwiseType type, + EltwiseFunctor(OpKernelContext *context, + const EltwiseType type, const std::vector &coeff, const float scalar_input, // float as it comes from arg const int32_t scalar_input_index, const DataFormat data_format) - : EltwiseFunctorBase(type, + : EltwiseFunctorBase(context, + type, coeff, scalar_input, scalar_input_index, @@ -956,12 +961,14 @@ struct EltwiseFunctor : EltwiseFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct EltwiseFunctor : EltwiseFunctorBase { - EltwiseFunctor(const EltwiseType type, + EltwiseFunctor(OpKernelContext *context, + const EltwiseType type, const std::vector &coeff, const float scalar_input, const int32_t scalar_input_index, const DataFormat data_format) - : EltwiseFunctorBase(type, + : EltwiseFunctorBase(context, + type, coeff, scalar_input, scalar_input_index, diff --git a/mace/kernels/fill.h b/mace/kernels/fill.h index b534a1839c77d183441e9cff74c1de6a917fa648..131dd9d4bffc8f851dd22e1f1a1603defc3d5bb2 100644 --- a/mace/kernels/fill.h +++ 
b/mace/kernels/fill.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" namespace mace { @@ -30,8 +31,8 @@ template struct FillFunctor; template <> -struct FillFunctor { - FillFunctor() {} +struct FillFunctor : OpKernel { + explicit FillFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *shape, const Tensor *value, diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h index e5172920a2a3c08862948257debff0387da10a0c..e6743aa4475777310d9c93183bef8031fd1931c8 100644 --- a/mace/kernels/fully_connected.h +++ b/mace/kernels/fully_connected.h @@ -27,10 +27,12 @@ namespace mace { namespace kernels { -struct FullyConnectedBase { - FullyConnectedBase(const ActivationType activation, +struct FullyConnectedBase : OpKernel { + FullyConnectedBase(OpKernelContext *context, + const ActivationType activation, const float relux_max_limit) - : activation_(activation), + : OpKernel(context), + activation_(activation), relux_max_limit_(relux_max_limit) {} const ActivationType activation_; @@ -42,9 +44,10 @@ struct FullyConnectedFunctor; template <> struct FullyConnectedFunctor: FullyConnectedBase { - FullyConnectedFunctor(const ActivationType activation, + FullyConnectedFunctor(OpKernelContext *context, + const ActivationType activation, const float relux_max_limit) - : FullyConnectedBase(activation, relux_max_limit) {} + : FullyConnectedBase(context, activation, relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *weight, @@ -86,9 +89,10 @@ struct FullyConnectedFunctor: FullyConnectedBase { template <> struct FullyConnectedFunctor: FullyConnectedBase { - FullyConnectedFunctor(const ActivationType activation, + FullyConnectedFunctor(OpKernelContext *context, + const ActivationType activation, const float relux_max_limit) - : FullyConnectedBase(activation, relux_max_limit) {} + : FullyConnectedBase(context, activation, relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *weight, @@ -117,7 +121,7 @@ struct FullyConnectedFunctor: FullyConnectedBase { const int32_t *bias_ptr = nullptr; if (bias == nullptr) { zero_bias.reset( - new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32)); + new Tensor(GetCPUAllocator(), DT_INT32)); zero_bias->Resize(bias_shape); zero_bias->Clear(); bias_ptr = zero_bias->data(); @@ -148,9 +152,10 @@ struct FullyConnectedFunctor: FullyConnectedBase { #ifdef MACE_ENABLE_OPENCL template struct FullyConnectedFunctor : FullyConnectedBase { - FullyConnectedFunctor(const ActivationType activation, + FullyConnectedFunctor(OpKernelContext *context, + const ActivationType activation, const float relux_max_limit) - : FullyConnectedBase(activation, relux_max_limit) {} + : FullyConnectedBase(context, activation, relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *weight, diff --git a/mace/kernels/gather.h b/mace/kernels/gather.h index 101a60e3f9913c3ab14f7ba4e952390e832e7768..ddfa14d1d9b978e130162ee58d533810ae18cfec 100644 --- a/mace/kernels/gather.h +++ b/mace/kernels/gather.h @@ -21,13 +21,15 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" namespace mace { namespace kernels { -struct GatherBase { - explicit GatherBase(int axis, float y) : axis_(axis), y_(y) {} +struct GatherBase : OpKernel { + GatherBase(OpKernelContext *context, int axis, float y) + : OpKernel(context), axis_(axis), y_(y) {} 
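// ---------------------------------------------------------------------------
// Editor's note (not part of the diff): every functor touched in this change
// set now derives from the small OpKernel base introduced in
// mace/kernels/kernel.h (its full definition appears a few hunks below), so
// device state flows in through an explicit OpKernelContext instead of the
// OpenCLRuntime::Global()/GetDeviceAllocator() singletons. A hypothetical
// CPU functor following the pattern, as a minimal sketch:

#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"

namespace mace {
namespace kernels {

struct ScaleFunctor : OpKernel {  // illustrative kernel, not in the tree
  ScaleFunctor(OpKernelContext *context, float scale)
      : OpKernel(context), scale_(scale) {}

  MaceStatus operator()(const Tensor *input, Tensor *output) {
    // Device-specific resources come from the injected context when needed,
    // e.g. context_->device()->allocator(), or
    // context_->device()->opencl_runtime() in the OpenCL specializations.
    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
    Tensor::MappingGuard input_guard(input);
    Tensor::MappingGuard output_guard(output);
    const float *in = input->data<float>();
    float *out = output->mutable_data<float>();
    for (index_t i = 0; i < input->size(); ++i) {
      out[i] = in[i] * scale_;
    }
    return MACE_SUCCESS;
  }

  float scale_;
};

}  // namespace kernels
}  // namespace mace
// ---------------------------------------------------------------------------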
int axis_; float y_; @@ -38,7 +40,8 @@ struct GatherFunctor; template <> struct GatherFunctor : GatherBase { - explicit GatherFunctor(int axis, float y) : GatherBase(axis, y) {} + GatherFunctor(OpKernelContext *context, int axis, float y) + : GatherBase(context, axis, y) {} MaceStatus operator()(const Tensor *params, const Tensor *indices, diff --git a/mace/kernels/gemm.cc b/mace/kernels/gemm.cc index c94c0af5e900e7414d452e35630b8c6f623418b7..5043a1041fea0fffeb0661651244b7b8c1899771 100644 --- a/mace/kernels/gemm.cc +++ b/mace/kernels/gemm.cc @@ -1341,8 +1341,8 @@ void Gemm(const float *A, ik_begin = bk * block_size_k + (bk < remain_k ? bk : remain_k); const index_t ik_end = std::min(K, ik_begin + this_block_size_k); - Tensor trans_a; - Tensor trans_b; + Tensor trans_a(GetCPUAllocator(), DataType::DT_FLOAT); + Tensor trans_b(GetCPUAllocator(), DataType::DT_FLOAT); const float *real_a = nullptr; const float *real_b = nullptr; float *real_c = c_base + (ih_begin * width + iw_begin); @@ -1399,8 +1399,8 @@ void GemmRef(const float *A, const bool transpose_b) { memset(C, 0, sizeof(float) * batch * height * width); - Tensor trans_a; - Tensor trans_b; + Tensor trans_a(GetCPUAllocator(), DataType::DT_FLOAT); + Tensor trans_b(GetCPUAllocator(), DataType::DT_FLOAT); float *trans_a_data = nullptr; float *trans_b_data = nullptr; if (transpose_a) { diff --git a/mace/kernels/image_to_buffer.h b/mace/kernels/image_to_buffer.h index 4e6b057f78520bbb05b18599482ff04a24e407c2..c4394fda15e95c2c65af625ed0e711af4391be6b 100644 --- a/mace/kernels/image_to_buffer.h +++ b/mace/kernels/image_to_buffer.h @@ -20,21 +20,24 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/kernels/opencl/common.h" namespace mace { namespace kernels { -struct ImageToBufferFunctorBase { - explicit ImageToBufferFunctorBase(const int wino_blk_size) - : wino_blk_size_(wino_blk_size) {} +struct ImageToBufferFunctorBase : OpKernel { + ImageToBufferFunctorBase(OpKernelContext *context, + const int wino_blk_size) + : OpKernel(context), + wino_blk_size_(wino_blk_size) {} const int wino_blk_size_; }; template struct ImageToBufferFunctor : ImageToBufferFunctorBase { - explicit ImageToBufferFunctor(const int wino_blk_size) - : ImageToBufferFunctorBase(wino_blk_size) {} + ImageToBufferFunctor(OpKernelContext *context, const int wino_blk_size) + : ImageToBufferFunctorBase(context, wino_blk_size) {} MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, @@ -50,8 +53,9 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase { template struct ImageToBufferFunctor : ImageToBufferFunctorBase { - explicit ImageToBufferFunctor(const int wino_blk_size) - : ImageToBufferFunctorBase(wino_blk_size) {} + ImageToBufferFunctor(OpKernelContext *context, + const int wino_blk_size) + : ImageToBufferFunctorBase(context, wino_blk_size) {} MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, diff --git a/mace/kernels/kernel.h b/mace/kernels/kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..853e974f76a5667c326c85346bfd3ba274b2cd9f --- /dev/null +++ b/mace/kernels/kernel.h @@ -0,0 +1,31 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_KERNEL_H_ +#define MACE_KERNELS_KERNEL_H_ + +#include "mace/core/op_kernel_context.h" + +namespace mace { +namespace kernels { + +struct OpKernel { + explicit OpKernel(OpKernelContext *context): context_(context) {} + + OpKernelContext *context_; +}; + +} // namespace kernels +} // namespace mace +#endif // MACE_KERNELS_KERNEL_H_ diff --git a/mace/kernels/local_response_norm.h b/mace/kernels/local_response_norm.h index 0af86327abc120b7a31348c9a3f393437466faf4..d9eeb7db43d8d8864f3f4a5a2d708fc600adfcab 100644 --- a/mace/kernels/local_response_norm.h +++ b/mace/kernels/local_response_norm.h @@ -21,7 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" -#include "mace/public/mace.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -34,7 +34,9 @@ template struct LocalResponseNormFunctor; template<> -struct LocalResponseNormFunctor { +struct LocalResponseNormFunctor : OpKernel { + explicit LocalResponseNormFunctor(OpKernelContext *context) + : OpKernel(context) {} MaceStatus operator()(const Tensor *input, int depth_radius, float bias, diff --git a/mace/kernels/lstmcell.h b/mace/kernels/lstmcell.h index 46439fae1f269abf21f53fe3ac75ba67df6406be..cb6b86fdd2959067b9d5c53bc69cdb325b286d2e 100644 --- a/mace/kernels/lstmcell.h +++ b/mace/kernels/lstmcell.h @@ -23,6 +23,7 @@ #include "mace/core/future.h" #include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #if defined(MACE_ENABLE_NEON) #include @@ -35,9 +36,10 @@ template struct LSTMCellFunctor; template -struct LSTMCellFunctor { - explicit LSTMCellFunctor(T forget_bias) : - forget_bias_(static_cast(forget_bias)) {} +struct LSTMCellFunctor : OpKernel{ + LSTMCellFunctor(OpKernelContext *context, T forget_bias) + : OpKernel(context), + forget_bias_(static_cast(forget_bias)) {} MaceStatus operator()(const Tensor *input, const Tensor *pre_output, const Tensor *weight, diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h index 42e76002a231d3b0b5ebc38d3df0bacf0cc265a0..4b6c5cf1ef8309281178fd52f545556b87a80190 100644 --- a/mace/kernels/matmul.h +++ b/mace/kernels/matmul.h @@ -29,6 +29,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/kernels/gemm.h" +#include "mace/kernels/kernel.h" #include "mace/utils/utils.h" #include "mace/kernels/gemmlowp_util.h" @@ -40,7 +41,8 @@ namespace mace { namespace kernels { template -struct MatMulFunctor { +struct MatMulFunctor : OpKernel { + explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *A, const Tensor *B, Tensor *C, @@ -87,7 +89,7 @@ struct MatMulFunctor { // A * B = (B^T * A^T)^T if (!transpose_b) { if (B_transpose_.get() == nullptr) { - B_transpose_.reset(new Tensor(GetDeviceAllocator(D), + B_transpose_.reset(new Tensor(context_->device()->allocator(), DataTypeToEnum::v())); B_transpose_->Resize({batch, width, K}); Tensor::MappingGuard guardbt(B_transpose_.get()); @@ -112,7 +114,8 @@ struct MatMulFunctor { }; template <> -struct MatMulFunctor { 
+struct MatMulFunctor : OpKernel { + explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {} template void MatMulImpl(const Tensor *A, const Tensor *B, @@ -208,7 +211,8 @@ struct MatMulFunctor { #ifdef MACE_ENABLE_OPENCL template -struct MatMulFunctor { +struct MatMulFunctor : OpKernel { + explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *A, const Tensor *B, Tensor *C, diff --git a/mace/kernels/opencl/activation.cc b/mace/kernels/opencl/activation.cc index 2cd0c2a3868357946ccb73979fb0e4b4c1391a06..7757758c379b82ccfc8238da9960d46eed50380a 100644 --- a/mace/kernels/opencl/activation.cc +++ b/mace/kernels/opencl/activation.cc @@ -33,11 +33,11 @@ MaceStatus ActivationFunctor::operator()( const index_t channel_blocks = RoundUpDiv4(channels); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); built_options.emplace("-Dactivation=" + kernel_name); @@ -94,12 +94,12 @@ MaceStatus ActivationFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = Default3DLocalWS(gws, kwg_size_); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, - lws, future)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index f01baa7170dbbf3c907ac38ab6d45bd600e50a31..7c1c1afc669b3de85055aac01ea9f96d9cf007ec 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -34,7 +34,7 @@ MaceStatus AddNFunctor::operator()( const index_t width = input_tensors[0]->dim(2); const index_t channels = input_tensors[0]->dim(3); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); for (size_t i = 1; i < size; ++i) { MACE_CHECK_NOTNULL(input_tensors[i]); @@ -49,7 +49,7 @@ MaceStatus AddNFunctor::operator()( MACE_NOT_IMPLEMENTED; } std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn"); @@ -96,7 +96,7 @@ MaceStatus AddNFunctor::operator()( std::string tuning_key = Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; diff --git a/mace/kernels/opencl/batch_norm.cc b/mace/kernels/opencl/batch_norm.cc index e26065d9d340022455b585e55c953aab8c307e5c..446a26cc034bc9536d2495fc89c91d8174804f06 100644 --- a/mace/kernels/opencl/batch_norm.cc +++ b/mace/kernels/opencl/batch_norm.cc @@ -44,11 +44,11 @@ MaceStatus BatchNormFunctor::operator()( static_cast(width), static_cast(height * batch)}; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == 
nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); @@ -101,11 +101,11 @@ MaceStatus BatchNormFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = Default3DLocalWS(gws, kwg_size_); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat("batch_norm_opencl_kernel", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3), folded_constant_); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; diff --git a/mace/kernels/opencl/bias_add.cc b/mace/kernels/opencl/bias_add.cc index aaa0d17203c40dbd177e5a42956b8d9d3078c9f2..eae22c0074c8205c69fc7741274b09700e94d6f1 100644 --- a/mace/kernels/opencl/bias_add.cc +++ b/mace/kernels/opencl/bias_add.cc @@ -39,12 +39,12 @@ MaceStatus BiasAddFunctor::operator()(const Tensor *input, static_cast(width), static_cast(height * batch)}; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; auto dt = DataTypeToEnum::value; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); built_options.emplace("-Dbias_add=" + kernel_name); @@ -65,7 +65,7 @@ MaceStatus BiasAddFunctor::operator()(const Tensor *input, input_shape_ = input->shape(); } - const std::vector lws = Default3DLocalWS(gws, kwg_size_); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); cl::Event event; cl_int error; diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index c95ef0ade2789f880cb563ee2d0103c7de4abf6f..75d0c4f542a11feda4e615ff025d0d771931008b 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -75,12 +75,12 @@ MaceStatus BufferToImageFunctor::operator()( } } - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index d74346832d9ff41af251decbdd0e113a408c9f62..64de09c2d597b5fe5bd2bf4923c442426c43ce8c 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -41,11 +41,11 @@ MaceStatus ChannelShuffleFunctor::operator()( static_cast(width), static_cast(height * batch)}; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); built_options.emplace("-Dchannel_shuffle=" + kernel_name); @@ -72,11 +72,11 @@ MaceStatus ChannelShuffleFunctor::operator()( input_shape_ = 
input->shape(); } - const std::vector lws = Default3DLocalWS(gws, kwg_size_); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 58b27faa92ff1a3979249cbe9a552ff4f58323c6..6fa4ba8fedf2abc4365a2b83b16c0d90a51aeae0 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -22,13 +22,15 @@ namespace mace { namespace kernels { namespace { -std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { +std::vector LocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, + const uint32_t kwg_size) { std::vector lws(4, 0); if (kwg_size == 0) { lws[0] = lws[1] = lws[2] = 1; } else { uint64_t - cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + cache_size = runtime->device_global_mem_cache_size(); uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); lws[0] = std::min(base, kwg_size / lws[1]); @@ -41,7 +43,8 @@ std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { } // namespace -static MaceStatus Concat2(cl::Kernel *kernel, +static MaceStatus Concat2(OpKernelContext *context, + cl::Kernel *kernel, const Tensor *input0, const Tensor *input1, const DataType dt, @@ -61,11 +64,11 @@ static MaceStatus Concat2(cl::Kernel *kernel, static_cast(batch * height), }; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context->device()->opencl_runtime(); if (kernel->get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error); + OUT_OF_RANGE_CONFIG(*kernel_error, context); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); built_options.emplace("-Dconcat_channel=" + kernel_name); @@ -100,17 +103,18 @@ static MaceStatus Concat2(cl::Kernel *kernel, *prev_input_shape = input0->shape(); } - const std::vector lws = LocalWS(gws, *kwg_size); + const std::vector lws = LocalWS(runtime, gws, *kwg_size); std::string tuning_key = Concat("concat_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } -static MaceStatus ConcatN(cl::Kernel *kernel, +static MaceStatus ConcatN(OpKernelContext *context, + cl::Kernel *kernel, const std::vector &input_list, const DataType dt, Tensor *output, @@ -121,11 +125,11 @@ static MaceStatus ConcatN(cl::Kernel *kernel, const index_t height = output->dim(1); const index_t width = output->dim(2); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context->device()->opencl_runtime(); if (kernel->get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error); + OUT_OF_RANGE_CONFIG(*kernel_error, context); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); built_options.emplace("-Dconcat_channel_multi=" + kernel_name); @@ -148,7 +152,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel, static_cast(input_channel_blk), static_cast(width), static_cast(batch * height), }; - const 
std::vector lws = LocalWS(gws, *kwg_size); + const std::vector lws = LocalWS(runtime, gws, *kwg_size); uint32_t idx = 0; OUT_OF_RANGE_SET_ARG_PTR; @@ -168,8 +172,6 @@ static MaceStatus ConcatN(cl::Kernel *kernel, for (size_t j = 0; j < 3; ++j) { roundup_gws[j] = RoundUp(gws[j], lws[j]); } - const std::vector lws = LocalWS(gws, *kwg_size); - error = runtime->command_queue().enqueueNDRangeKernel( *kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), @@ -187,7 +189,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel, } } if (future != nullptr) { - future->wait_fn = [runtime, call_stats](CallStats *stats) { + future->wait_fn = [call_stats](CallStats *stats) { if (stats != nullptr) { stats->start_micros = call_stats.start_micros; stats->end_micros = stats->start_micros + call_stats.end_micros; @@ -234,12 +236,14 @@ MaceStatus ConcatFunctor::operator()( switch (inputs_count) { case 2: - return Concat2(&kernel_, input_list[0], input_list[1], + return Concat2(context_, + &kernel_, input_list[0], input_list[1], DataTypeToEnum::value, &input_shape_, output, future, &kwg_size_, &kernel_error_); default: if (divisible_four) { - return ConcatN(&kernel_, input_list, DataTypeToEnum::value, output, + return ConcatN(context_, + &kernel_, input_list, DataTypeToEnum::value, output, future, &kwg_size_, &kernel_error_); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/kernels/opencl/conv_2d.cc b/mace/kernels/opencl/conv_2d.cc index 6221382e7e8e9d9290777379d7d77832b17b8e40..bc8538b77e9f9de56a6e51cdbdbcd905ff8f2a50 100644 --- a/mace/kernels/opencl/conv_2d.cc +++ b/mace/kernels/opencl/conv_2d.cc @@ -18,7 +18,8 @@ namespace mace { namespace kernels { -extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, +extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *runtime, + cl::Kernel *kernel, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -34,7 +35,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, uint32_t *kwg_size, std::unique_ptr *kernel_error); -extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, +extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *runtime, + cl::Kernel *kernel, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -50,7 +52,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, uint32_t *kwg_size, std::unique_ptr *kernel_error); -extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, +extern MaceStatus Conv2dOpencl(OpKernelContext *runtime, + cl::Kernel *kernel, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -73,9 +76,10 @@ MaceStatus Conv2dFunctor::operator()(const Tensor *input, Tensor *output, StatsFuture *future) { typedef MaceStatus (*Conv2dOpenclFunction)( - cl::Kernel * kernel, const Tensor *input, const Tensor *filter, - const Tensor *bias, const int stride, const int *padding, - const int *dilations, const ActivationType activation, + OpKernelContext *runtime, cl::Kernel * kernel, const Tensor *input, + const Tensor *filter, const Tensor *bias, const int stride, + const int *padding, const int *dilations, + const ActivationType activation, const float relux_max_limit, const DataType dt, std::vector *input_shape, Tensor *output, StatsFuture *future, uint32_t *kwg_size, std::unique_ptr *kernel_error); @@ -116,12 +120,12 @@ MaceStatus Conv2dFunctor::operator()(const Tensor *input, if (kernel_h == kernel_w && kernel_h <= 3 && selector[kernel_h - 1] != nullptr) { auto conv2d_func = selector[kernel_h - 1]; - return conv2d_func( + return conv2d_func(context_, &kernel_, input, filter, bias, strides_[0], 
diff --git a/mace/kernels/opencl/conv_2d.cc b/mace/kernels/opencl/conv_2d.cc
index 6221382e7e8e9d9290777379d7d77832b17b8e40..bc8538b77e9f9de56a6e51cdbdbcd905ff8f2a50 100644
--- a/mace/kernels/opencl/conv_2d.cc
+++ b/mace/kernels/opencl/conv_2d.cc
@@ -18,7 +18,8 @@
 namespace mace {
 namespace kernels {

-extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *runtime,
+                                   cl::Kernel *kernel,
                                    const Tensor *input,
                                    const Tensor *filter,
                                    const Tensor *bias,
@@ -34,7 +35,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
                                    uint32_t *kwg_size,
                                    std::unique_ptr<Buffer> *kernel_error);

-extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *runtime,
+                                   cl::Kernel *kernel,
                                    const Tensor *input,
                                    const Tensor *filter,
                                    const Tensor *bias,
@@ -50,7 +52,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
                                    uint32_t *kwg_size,
                                    std::unique_ptr<Buffer> *kernel_error);

-extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpencl(OpKernelContext *runtime,
+                               cl::Kernel *kernel,
                                const Tensor *input,
                                const Tensor *filter,
                                const Tensor *bias,
@@ -73,9 +76,10 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                          Tensor *output,
                                                          StatsFuture *future) {
   typedef MaceStatus (*Conv2dOpenclFunction)(
-      cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
-      const Tensor *bias, const int stride, const int *padding,
-      const int *dilations, const ActivationType activation,
+      OpKernelContext *runtime, cl::Kernel * kernel, const Tensor *input,
+      const Tensor *filter, const Tensor *bias, const int stride,
+      const int *padding, const int *dilations,
+      const ActivationType activation,
       const float relux_max_limit, const DataType dt,
       std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
       uint32_t *kwg_size, std::unique_ptr<Buffer> *kernel_error);
@@ -116,12 +120,12 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   if (kernel_h == kernel_w && kernel_h <= 3 &&
       selector[kernel_h - 1] != nullptr) {
     auto conv2d_func = selector[kernel_h - 1];
-    return conv2d_func(
+    return conv2d_func(context_,
         &kernel_, input, filter, bias, strides_[0], paddings.data(),
         dilations_, activation_, relux_max_limit_, DataTypeToEnum<T>::value,
         &input_shape_, output, future, &kwg_size_, &kernel_error_);
   } else {
-    return Conv2dOpencl(
+    return Conv2dOpencl(context_,
         &kernel_, input, filter, bias, strides_[0], paddings.data(),
         dilations_, activation_, relux_max_limit_, DataTypeToEnum<T>::value,
         &input_shape_, output, future, &kwg_size_, &kernel_error_);
diff --git a/mace/kernels/opencl/conv_2d_1x1.cc b/mace/kernels/opencl/conv_2d_1x1.cc
index 770f0606d4152c6ad7e65f92c94246487571417a..c43c045019ba1a2e9cde11cd9288159f36ed45ec 100644
--- a/mace/kernels/opencl/conv_2d_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_1x1.cc
@@ -25,14 +25,16 @@ namespace {
 const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
 // TODO(liuqi): Fix the specific value.
 const uint32_t lws_limit = 128;
-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   if (kwg_size == 0) {
     lws[0] = lws[1] = lws[2] = 1;
   } else {
     uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-    uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
+        cache_size = runtime->device_global_mem_cache_size();
+    uint32_t compute_units = runtime->device_compute_units();
     const uint32_t base =
         std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
     lws[1] = std::min(gws[1], kwg_size);
@@ -62,7 +64,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {

 }  // namespace

-extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context,
+                                   cl::Kernel *kernel,
                                    const Tensor *input,
                                    const Tensor *filter,
                                    const Tensor *bias,
@@ -92,13 +95,13 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
   const index_t width_blocks = RoundUpDiv4(width);
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

   if (kernel->get() == nullptr) {
     MACE_CHECK(input_batch == batch);

     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
     built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
@@ -160,11 +163,11 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }

-  std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
+  std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
   std::string tuning_key =
       Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                            gws, lws, future));
   OUT_OF_RANGE_VALIDATION(*kernel_error);
   return MACE_SUCCESS;
diff --git a/mace/kernels/opencl/conv_2d_3x3.cc b/mace/kernels/opencl/conv_2d_3x3.cc
index 02df4ea166abd8f80bd77e9cc6c91754a30503e8..c0362831658ccd327ec0407bdc9f4ff05d40cf1c 100644
--- a/mace/kernels/opencl/conv_2d_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_3x3.cc
@@ -24,15 +24,17 @@ namespace kernels {
 namespace {
 // (inputs + weights + outputs) * array_size * sizeof(float)
 const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   if (kwg_size == 0) {
     lws[0] = lws[1] = lws[2] = 1;
   } else {
     uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
     uint32_t compute_units = std::max<uint32_t>(
-        OpenCLRuntime::Global()->device_compute_units() / 2, 1);
+        runtime->device_compute_units() / 2, 1);
     const uint32_t base = std::max<uint32_t>(
         std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1);
@@ -55,7 +57,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {

 }  // namespace

-extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context,
+                                   cl::Kernel *kernel,
                                    const Tensor *input,
                                    const Tensor *filter,
                                    const Tensor *bias,
@@ -80,11 +83,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
   const index_t width_blocks = RoundUpDiv<index_t, 5>(width);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
     built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
@@ -147,11 +150,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }

-  std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
+  std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
   std::string tuning_key =
       Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                            gws, lws, future));
   OUT_OF_RANGE_VALIDATION(*kernel_error);
   return MACE_SUCCESS;
diff --git a/mace/kernels/opencl/conv_2d_general.cc b/mace/kernels/opencl/conv_2d_general.cc
index fa2c9774b607652c3ca307239d5baae33aeac699..bac1da8f40e0c8ad2a75d328730ee9f0f495319b 100644
--- a/mace/kernels/opencl/conv_2d_general.cc
+++ b/mace/kernels/opencl/conv_2d_general.cc
@@ -26,7 +26,8 @@ namespace {
 const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
 // TODO(liuqi): Fix the specific value.
 const uint32_t lws_limit = 20;
-std::vector<uint32_t> LocalWS(const uint32_t *gws,
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
                               const uint32_t kernel_size,
                               const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
@@ -34,8 +35,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
     lws[0] = lws[1] = lws[2] = 1;
   } else {
     uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-    uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
+        cache_size = runtime->device_global_mem_cache_size();
+    uint32_t compute_units = runtime->device_compute_units();
     const uint32_t base =
         std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
     lws[1] = std::min(gws[1], kwg_size);
@@ -64,7 +65,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,

 }  // namespace

-extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpencl(OpKernelContext *context,
+                               cl::Kernel *kernel,
                                const Tensor *input,
                                const Tensor *filter,
                                const Tensor *bias,
@@ -89,11 +91,11 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
   const index_t width_blocks = RoundUpDiv4(width);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
     built_options.emplace("-Dconv_2d=" + kernel_name);
@@ -162,8 +164,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
       Concat("conv2d_general_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3), filter->dim(2), filter->dim(3));
   std::vector<uint32_t> lws =
-      LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+      LocalWS(runtime, gws, filter->dim(2) * filter->dim(3), *kwg_size);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(*kernel_error);
diff --git a/mace/kernels/opencl/crop.cc b/mace/kernels/opencl/crop.cc
index 651b2ef87a544ca6f682aedb3e8a2c1ae3bd4bf1..fce91d2be483d62570fec85bd91515f8bb89e8d5 100644
--- a/mace/kernels/opencl/crop.cc
+++ b/mace/kernels/opencl/crop.cc
@@ -22,13 +22,15 @@ namespace mace {
 namespace kernels {
 namespace {

-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   if (kwg_size == 0) {
     lws[0] = lws[1] = lws[2] = 1;
   } else {
     uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
     uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
     lws[1] = std::min(gws[1], kwg_size);
     lws[0] = std::min(base, kwg_size / lws[1]);
@@ -132,11 +134,11 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
       static_cast<uint32_t>(output->dim(0) * output->dim(1))
   };

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
     built_options.emplace("-Dcrop=" + kernel_name);
@@ -167,11 +169,11 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
     input_shape_ = input0->shape();
   }

-  const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
   std::string tuning_key =
       Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
   return MACE_SUCCESS;
diff --git a/mace/kernels/opencl/deconv_2d.cc b/mace/kernels/opencl/deconv_2d.cc
index cba8cbceaeb5fafdd7410250b0b36c6238706479..197b305e7e80b10d121c883d417c59a71d2abd9e 100644
--- a/mace/kernels/opencl/deconv_2d.cc
+++ b/mace/kernels/opencl/deconv_2d.cc
@@ -20,7 +20,8 @@ namespace kernels {

 namespace {

-MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
+MaceStatus Deconv2dOpencl(OpKernelContext *context,
+                          cl::Kernel *kernel,
                           const Tensor *input,
                           const Tensor *filter,
                           const Tensor *bias,
@@ -58,11 +59,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
   const int align_w = stride_w - 1 - padding_w;
   const int kernel_size = filter->dim(2) * filter->dim(3);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
     built_options.emplace("-Ddeconv_2d=" + kernel_name);
@@ -133,11 +134,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, *kwg_size);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, *kwg_size);
   std::string tuning_key =
       Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(*kernel_error);
@@ -192,9 +193,10 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
                  &output_image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));

-  return Deconv2dOpencl(&kernel_, input, filter, bias, strides_.data(),
-                        paddings.data(), activation_, relux_max_limit_,
-                        DataTypeToEnum<T>::value, &input_shape_, output, future,
+  return Deconv2dOpencl(context_, &kernel_, input, filter, bias,
+                        strides_.data(), paddings.data(), activation_,
+                        relux_max_limit_, DataTypeToEnum<T>::value,
+                        &input_shape_, output, future,
                         &kwg_size_, &kernel_error_);
 }
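For orientation on the align_h/align_w bookkeeping in Deconv2dOpencl above (align = stride - 1 - padding): the kernel relies on the standard transposed-convolution geometry. A self-contained check of that relation, not lifted from the MACE sources:

    // Standard transposed-convolution output size:
    //   out = (in - 1) * stride - 2 * padding + kernel
    #include <cassert>

    int DeconvOutSize(int in, int kernel, int stride, int padding) {
      return (in - 1) * stride - 2 * padding + kernel;
    }

    int main() {
      assert(DeconvOutSize(7, 3, 2, 1) == 13);   // 7x7 input -> 13x13 output
      assert(DeconvOutSize(16, 4, 2, 1) == 32);  // common 2x upsampling shape
      return 0;
    }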
diff --git a/mace/kernels/opencl/depth_to_space.cc b/mace/kernels/opencl/depth_to_space.cc
index 4c1fd3becb1ada46dee96afe50ff56ff728ba0e9..f5427af18d5b37887bb2991f0f51b2731c6e7eff 100644
--- a/mace/kernels/opencl/depth_to_space.cc
+++ b/mace/kernels/opencl/depth_to_space.cc
@@ -72,11 +72,11 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
     std::stringstream kernel_name_ss;
@@ -119,8 +119,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
     input_shape_ = input->shape();
   }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/depthwise_conv.cc
index 3c97a28845ae09152e6439092db8b78e4f992275..1bc910fdabc5551ff48e431f193ba42346830759 100644
--- a/mace/kernels/opencl/depthwise_conv.cc
+++ b/mace/kernels/opencl/depthwise_conv.cc
@@ -24,13 +24,15 @@ namespace kernels {
 namespace {
 // (inputs + weights + outputs) * array_size * sizeof(float)
 const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   if (kwg_size == 0) {
     lws[0] = lws[1] = lws[2] = 1;
   } else {
     uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
     uint32_t base = cache_size / kBaseGPUMemCacheSize;
     lws[1] = std::min(gws[1], kwg_size);
     if (lws[1] >= base) {
@@ -58,7 +60,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {

 }  // namespace

-static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
+static MaceStatus DepthwiseConv2d(OpKernelContext *context,
+                                  cl::Kernel *kernel,
                                   const Tensor *input,   // NHWC
                                   const Tensor *filter,  // HWIM
                                   const Tensor *bias,
@@ -89,11 +92,11 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
                            static_cast<uint32_t>(width_blocks),
                            static_cast<uint32_t>(height * batch)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
     if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
@@ -170,10 +173,10 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }

-  const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
+  const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
   std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel", gws[0],
                                   gws[1], gws[2], multiplier);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(*kernel_error);
@@ -190,14 +193,10 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
   index_t kernel_h = filter->dim(2);
   index_t kernel_w = filter->dim(3);
   if (strides_[0] != strides_[1]) {
-    LOG(WARNING) << "OpenCL depthwise conv2d kernel with "
-                 << "filter" << kernel_h << "x" << kernel_w << ","
-                 << " stride " << strides_[0] << "x" << strides_[1]
-                 << " is not implemented yet, using slow version";
-    // TODO(heliangliang) The CPU/NEON kernel should map the buffer
-    return DepthwiseConv2dFunctor<DeviceType::CPU, float>(
-        strides_, padding_type_, paddings_, dilations_, activation_,
-        relux_max_limit_)(input, filter, bias, output, future);
+    LOG(FATAL) << "GPU depthwise conv2d kernel with "
+               << "filter" << kernel_h << "x" << kernel_w << ","
+               << " stride " << strides_[0] << "x" << strides_[1]
+               << " is not implemented yet.";
   }

   // Create a fake conv_2d filter to calculate the paddings and output size
@@ -226,6 +225,7 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));

   return DepthwiseConv2d(
+      context_,
       &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
       activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
       output, future, &kwg_size_, &kernel_error_);
diff --git a/mace/kernels/opencl/eltwise.cc b/mace/kernels/opencl/eltwise.cc
index 9eedf011009ee8acbb97a00a84da5e140f11fa8c..201639e31bad24abc3c61053596b89d5fc7a25d7 100644
--- a/mace/kernels/opencl/eltwise.cc
+++ b/mace/kernels/opencl/eltwise.cc
@@ -75,10 +75,10 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(batch_height_pixels)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
@@ -124,11 +124,11 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
     input_shape_ = input0->shape();
   }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
   std::string tuning_key =
       Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));
   OUT_OF_RANGE_VALIDATION(kernel_error_);
   return MACE_SUCCESS;
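The recurring OUT_OF_RANGE_CONFIG change in these kernels swaps a global GPU-allocator lookup for the allocator of the device carried by the context (see the macro in helper.h further down). A toy model of that injection pattern, with stand-in Allocator/ErrorFlag types rather than the real MACE classes:

    // The one-byte out-of-range flag is allocated from whatever allocator the
    // caller injects; no global device registry is consulted.
    #include <cstddef>
    #include <cstdint>

    struct Allocator {
      virtual ~Allocator() = default;
      virtual void *New(size_t bytes) = 0;
      virtual void Delete(void *p) = 0;
    };

    struct HostAllocator : Allocator {
      void *New(size_t bytes) override { return new uint8_t[bytes](); }
      void Delete(void *p) override { delete[] static_cast<uint8_t *>(p); }
    };

    class ErrorFlag {
     public:
      explicit ErrorFlag(Allocator *a)
          : alloc_(a), byte_(static_cast<uint8_t *>(a->New(1))) {}
      ~ErrorFlag() { alloc_->Delete(byte_); }
      bool raised() const { return *byte_ != 0; }

     private:
      Allocator *alloc_;
      uint8_t *byte_;
    };

    int main() {
      HostAllocator host;
      ErrorFlag flag(&host);  // allocator injected, as via context->device()
      return flag.raised() ? 1 : 0;
    }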
MACE_OBFUSCATE_SYMBOL("fully_connected"); @@ -236,7 +238,7 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, std::string tuning_key = Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws->data(), *lws, future)); OUT_OF_RANGE_VALIDATION(*kernel_error); @@ -257,7 +259,8 @@ MaceStatus FullyConnectedFunctor::operator()( &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - return FCWXKernel(&kernel_, input, weight, bias, &input_shape_, output, + return FCWXKernel(context_, + &kernel_, input, weight, bias, &input_shape_, output, activation_, &gws_, &lws_, relux_max_limit_, future, &kernel_error_); } diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 6ef80c80d1e21b9e8a3c0e93b1721d50ccc46d00..aa3daadbd69f28975ec4ae75aba34ca78f595a69 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -226,14 +226,14 @@ std::string DtToUpCompatibleCLCMDDt(const DataType dt) { } } -std::vector Default3DLocalWS(const uint32_t *gws, +std::vector Default3DLocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); if (kwg_size == 0) { lws[0] = lws[1] = lws[2] = 1; } else { - uint64_t cache_size = - OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint64_t cache_size = runtime->device_global_mem_cache_size(); uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); lws[2] = @@ -245,13 +245,12 @@ std::vector Default3DLocalWS(const uint32_t *gws, return lws; } -MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, +MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime, + const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, StatsFuture *future) { - auto runtime = OpenCLRuntime::Global(); - auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); @@ -366,29 +365,28 @@ MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, } return error; }; - OpenCLProfilingTimer timer(&event); - cl_int err = Tuner::Get()->template TuneOrRun( + OpenCLProfilingTimer timer(runtime, &event); + cl_int err = runtime->tuner()->template TuneOrRun( tuning_key, lws, params_generator, func, &timer); MACE_CL_RET_STATUS(err); if (future != nullptr) { - future->wait_fn = [event](CallStats *stats) { + future->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { - OpenCLRuntime::Global()->GetCallStats(event, stats); + runtime->GetCallStats(event, stats); } }; } return MaceStatus::MACE_SUCCESS; } -MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, +MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime, + const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, StatsFuture *future) { - auto runtime = OpenCLRuntime::Global(); - auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); @@ -475,8 +473,8 @@ MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, } return error; }; - OpenCLProfilingTimer timer(&event); - cl_int err = Tuner::Get()->template TuneOrRun( + OpenCLProfilingTimer timer(runtime, &event); + cl_int err = runtime->tuner()->template TuneOrRun( tuning_key, lws, params_generator, func, 
&timer); MACE_CL_RET_STATUS(err); diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 5d4bf4104172ac093212fcb023941e9bb0015b6c..d9e309bc2c19045ffcd9eb4f373fb9dc7b208f61 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -31,11 +31,11 @@ namespace mace { namespace kernels { -#define OUT_OF_RANGE_CONFIG(kernel_error) \ +#define OUT_OF_RANGE_CONFIG(kernel_error, context) \ if (runtime->IsOutOfRangeCheckEnabled()) { \ built_options.emplace("-DOUT_OF_RANGE_CHECK"); \ (kernel_error) = std::move(std::unique_ptr( \ - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); \ + new Buffer((context)->device()->allocator()))); \ MACE_RETURN_IF_ERROR((kernel_error)->Allocate(1)); \ (kernel_error)->Map(nullptr); \ *((kernel_error)->mutable_data()) = 0; \ @@ -115,14 +115,16 @@ std::string DtToCLDt(const DataType dt); std::string DtToUpCompatibleCLDt(const DataType dt); // Tuning or Run OpenCL kernel with 3D work group size -MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, +MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime, + const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, StatsFuture *future); // Tuning or Run OpenCL kernel with 2D work group size -MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, +MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime, + const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, @@ -162,7 +164,8 @@ std::string Concat(Args... args) { return ss.str(); } -std::vector Default3DLocalWS(const uint32_t *gws, +std::vector Default3DLocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, const uint32_t kwg_size); } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/image_to_buffer.cc b/mace/kernels/opencl/image_to_buffer.cc index 955b9ebebd3fcb1d3bc48f04de7617e5b10e43cb..b98e9fb2ac77ee785ac21f17360fa998b37f537f 100644 --- a/mace/kernels/opencl/image_to_buffer.cc +++ b/mace/kernels/opencl/image_to_buffer.cc @@ -67,12 +67,12 @@ MaceStatus ImageToBufferFunctor::operator()( break; } - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; diff --git a/mace/kernels/opencl/lstmcell.cc b/mace/kernels/opencl/lstmcell.cc index ffc185d0dc84b2019c473827e8d02edc141e1482..6704c0b457d876c28a590860e6ad866ff24228ad 100644 --- a/mace/kernels/opencl/lstmcell.cc +++ b/mace/kernels/opencl/lstmcell.cc @@ -38,11 +38,11 @@ MaceStatus LSTMCellFunctor::operator()( const index_t width = input->dim(1); const index_t width_blocks = width / 4; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell"); @@ -88,7 +88,7 @@ MaceStatus LSTMCellFunctor::operator()( const std::vector lws = {kwg_size_ / 16, 16, 0}; std::string tuning_key = Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, 
diff --git a/mace/kernels/opencl/lstmcell.cc b/mace/kernels/opencl/lstmcell.cc
index ffc185d0dc84b2019c473827e8d02edc141e1482..6704c0b457d876c28a590860e6ad866ff24228ad 100644
--- a/mace/kernels/opencl/lstmcell.cc
+++ b/mace/kernels/opencl/lstmcell.cc
@@ -38,11 +38,11 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()(
   const index_t width = input->dim(1);
   const index_t width_blocks = width / 4;

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell");
@@ -88,7 +88,7 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()(
   const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
   std::string tuning_key =
       Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc
index 4df9d58d74098bd6aae2e9697333090f05e215ba..407b455d106ad765f4731f98caa948527d1a2129 100644
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -53,11 +53,11 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
       static_cast<uint32_t>(height_blocks * batch),
   };

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
@@ -84,7 +84,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
   const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
   std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc
index d257fea2d7fca9333c8d997e7703f53345feba2a..03f05ca5f5711edb327fd9a14e5de79a21eb074c 100644
--- a/mace/kernels/opencl/out_of_range_check_test.cc
+++ b/mace/kernels/opencl/out_of_range_check_test.cc
@@ -16,6 +16,8 @@
 #include <vector>

 #include "gtest/gtest.h"
+#include "mace/core/op_kernel_context.h"
+#include "mace/core/runtime/opencl/gpu_device.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
@@ -25,14 +27,15 @@ namespace mace {
 namespace kernels {
 namespace {

-bool BufferToImageOpImpl(Tensor *buffer,
+bool BufferToImageOpImpl(OpKernelContext *context,
+                         Tensor *buffer,
                          Tensor *image,
                          const std::vector<size_t> &image_shape) {
   std::unique_ptr<Buffer> kernel_error;
   uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                      static_cast<uint32_t>(image_shape[1])};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

   std::string kernel_name = "in_out_buffer_to_image";
   std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
@@ -40,7 +43,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
   std::stringstream kernel_name_ss;
   kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
   built_options.emplace(kernel_name_ss.str());
-  OUT_OF_RANGE_CONFIG(kernel_error);
+  OUT_OF_RANGE_CONFIG(kernel_error, context);
   NON_UNIFORM_WG_CONFIG;
   if (buffer->dtype() == image->dtype()) {
     built_options.emplace("-DDATA_TYPE=" +
@@ -127,25 +130,33 @@ TEST(OutOfRangeCheckTest, RandomTest) {
   index_t width = 7;
   index_t channels = 11;

-  std::vector<index_t> buffer_shape = {batch, height, width, channels};
+  GPUContext gpu_context;
+  std::unique_ptr<Device> device(new GPUDevice(gpu_context.opencl_tuner()));
+
   Workspace ws;
+  OpKernelContext context(&ws, device.get());
+
+  std::vector<index_t> buffer_shape = {batch, height, width, channels};
   Tensor *buffer =
-      ws.CreateTensor("Buffer", GetDeviceAllocator(DeviceType::GPU),
+      ws.CreateTensor("Buffer", device->allocator(),
                       DataTypeToEnum<float>::v());
   buffer->Resize(buffer_shape);

   std::vector<size_t> image_shape;
-  Tensor *image = ws.CreateTensor("Image", GetDeviceAllocator(DeviceType::GPU),
+  Tensor *image = ws.CreateTensor("Image", device->allocator(),
                                   DataTypeToEnum<float>::v());
   CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape);
   image->ResizeImage(buffer->shape(), image_shape);

-  ASSERT_FALSE(BufferToImageOpImpl(buffer, image, image_shape));
+  ASSERT_FALSE(BufferToImageOpImpl(&context, buffer, image, image_shape));

   std::vector<size_t> overflow_image_shape = image_shape;
   for (size_t i = 0; i < overflow_image_shape.size(); ++i) {
     overflow_image_shape[i] += 1;
   }
-  ASSERT_TRUE(BufferToImageOpImpl(buffer, image, overflow_image_shape));
+  ASSERT_TRUE(BufferToImageOpImpl(&context,
+                                  buffer,
+                                  image,
+                                  overflow_image_shape));
 }

 }  // namespace kernels
diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc
index 04e9d69d4aaf8f7a81f2deee644e80cdc4988145..a3f4cfaa53c7b21f95cfcbea219c4fe3853d6a72 100644
--- a/mace/kernels/opencl/pad.cc
+++ b/mace/kernels/opencl/pad.cc
@@ -47,11 +47,11 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   const index_t channel_blocks = RoundUpDiv4(channels);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
     built_options.emplace("-Dpad=" + kernel_name);
@@ -85,10 +85,10 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     input_shape_ = input->shape();
   }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
   std::string tuning_key =
       Concat("pad", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
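The reworked test above shows the new setup sequence end to end: build a GPU context, wrap it in a Device, and hand an OpKernelContext to the kernel. The shape of that dependency injection, reduced to a compilable toy with stand-in types (none of these are the real MACE classes):

    #include <iostream>
    #include <string>

    struct Device { std::string name; };

    struct OpKernelContext {
      explicit OpKernelContext(Device *d) : device_(d) {}
      Device *device() const { return device_; }
     private:
      Device *device_;
    };

    struct OpKernel {
      explicit OpKernel(OpKernelContext *context) : context_(context) {}
     protected:
      OpKernelContext *context_;  // every functor now carries its context
    };

    struct PadKernel : OpKernel {
      using OpKernel::OpKernel;
      void Run() { std::cout << "pad on " << context_->device()->name << "\n"; }
    };

    int main() {
      Device gpu{"adreno"};
      OpKernelContext ctx(&gpu);
      PadKernel(&ctx).Run();  // each engine owns its device; no globals
      return 0;
    }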
diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc
index 18eb6e80f9595ac177db70d49f9ac81b822bcbfc..c6743750a3b381e5fcbb632980df50340fa872d2 100644
--- a/mace/kernels/opencl/pooling.cc
+++ b/mace/kernels/opencl/pooling.cc
@@ -23,13 +23,15 @@ namespace kernels {

 namespace {

-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   if (kwg_size == 0) {
     lws[0] = lws[1] = lws[2] = 1;
   } else {
     uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
     uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
     lws[1] = std::min(gws[1], kwg_size);
     lws[2] =
@@ -54,12 +56,12 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
       << "Pooling opencl kernel not support dilation yet";

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     const DataType dt = DataTypeToEnum<T>::value;
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
     built_options.emplace("-Dpooling=" + kernel_name);
@@ -149,11 +151,11 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     };
   }

-  const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_);
+  const std::vector<uint32_t> lws = LocalWS(runtime, gws.data(), kwg_size_);
   std::string tuning_key =
       Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws.data(), lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/opencl/reduce_mean.cc b/mace/kernels/opencl/reduce_mean.cc
index 075632c554323d591ab614c45d55717b9bcc44ad..a6a45f764a6e78628a98410ba20a423c58e0c6fd 100644
--- a/mace/kernels/opencl/reduce_mean.cc
+++ b/mace/kernels/opencl/reduce_mean.cc
@@ -39,7 +39,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
   const index_t channel_blocks = RoundUpDiv4(channels);
   const uint32_t image_size = static_cast<uint32_t>(in_height * in_width);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();
   std::vector<uint32_t> gws(3);
   std::vector<uint32_t> lws(3);
   std::vector<index_t> output_shape{batch, 1, 1, channels};
@@ -50,7 +50,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
   if (kernel_.get() == nullptr) {
     const DataType dt = DataTypeToEnum<T>::value;
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean");
     built_options.emplace("-Dreduce_mean=" + kernel_name);
diff --git a/mace/kernels/opencl/resize_bicubic.cc b/mace/kernels/opencl/resize_bicubic.cc
index f8a33383e99a2db4978f40a9c17ce9034df30218..6fc26e52d8d7d1dc2a2e0e1229541fd2800dd358 100644
--- a/mace/kernels/opencl/resize_bicubic.cc
+++ b/mace/kernels/opencl/resize_bicubic.cc
@@ -23,9 +23,11 @@ namespace mace {
 namespace kernels {
 namespace {

-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  uint64_t cache_size = runtime->device_global_mem_cache_size();
   uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   lws[1] = std::min(gws[1], kwg_size);
   if (lws[1] >= base) {
@@ -65,15 +67,15 @@ MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()(
                            static_cast<uint32_t>(out_width),
                            static_cast<uint32_t>(out_height * batch)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
+    auto dt = DataTypeToEnum<T>::value;
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache");
     built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name);
-    auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
     built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize));
@@ -115,11 +117,11 @@ MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()(
     input_shape_ = input->shape();
   }

-  const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
   std::string tuning_key =
       Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc
index 0b297dd22dae97f3be1fdf881a42118acf03169c..23e5db1c102979c9f3dbea869016d56ccc359d62 100644
--- a/mace/kernels/opencl/resize_bilinear.cc
+++ b/mace/kernels/opencl/resize_bilinear.cc
@@ -23,13 +23,15 @@ namespace mace {
 namespace kernels {
 namespace {

-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   if (kwg_size == 0) {
     lws[0] = lws[1] = lws[2] = 1;
   } else {
     uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
     uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
     lws[1] = std::min(gws[1], kwg_size);
     if (lws[1] >= base) {
@@ -70,11 +72,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
                            static_cast<uint32_t>(out_width),
                            static_cast<uint32_t>(out_height * batch)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
     built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
@@ -118,11 +120,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
     input_shape_ = input->shape();
   }

-  const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
   std::string tuning_key =
       Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc
index f401b827096189156c184f348f0017ede7dce13f..e84ec7312d6d0e2e5cce33b1253b0ff948af19d5 100644
--- a/mace/kernels/opencl/softmax.cc
+++ b/mace/kernels/opencl/softmax.cc
@@ -24,13 +24,15 @@ namespace kernels {

 namespace {

-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   if (kwg_size == 0) {
     lws[0] = lws[1] = lws[2] = 1;
   } else {
     uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
     uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
     lws[1] = std::min(gws[1], kwg_size);
     if (gws[0] < base) {
@@ -78,11 +80,11 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
     built_options.emplace("-Dsoftmax=" + kernel_name);
@@ -107,10 +109,10 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
     input_shape_ = logits->shape();
   }

-  std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
+  std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
   std::string tuning_key =
       Concat("softmax_opencl_kernel", batch, height, width, channels);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/opencl/space_to_batch.cc b/mace/kernels/opencl/space_to_batch.cc
index c31b2d691f5fccd72faa75f35ce88f034bd7900f..8794dd2a5ee2cefffaa8cec5b591501b3980c2a8 100644
--- a/mace/kernels/opencl/space_to_batch.cc
+++ b/mace/kernels/opencl/space_to_batch.cc
@@ -54,12 +54,12 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
       chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
       static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::stringstream kernel_name_ss;
     kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
@@ -99,11 +99,11 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
     space_shape_ = space_tensor->shape();
   }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
   std::string tuning_key =
       Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
              batch_tensor->dim(2), batch_tensor->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/opencl/split.cc b/mace/kernels/opencl/split.cc
index 65fd6be530898200e50cc74518813cd01e7c9d15..c445b783564095e5ef27ecfc486fbe79b0cc1548 100644
--- a/mace/kernels/opencl/split.cc
+++ b/mace/kernels/opencl/split.cc
@@ -40,11 +40,11 @@ MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()(
         output_list[i]->ResizeImage(output_shape, image_shape));
   }

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
     built_options.emplace("-Dsplit=" + kernel_name);
@@ -66,7 +66,7 @@ MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()(
       static_cast<uint32_t>(input->dim(0) * input->dim(1)),
   };

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
   cl::Event event;
   CallStats call_stats{INT64_MAX, 0};
   for (size_t i = 0; i < outputs_count; ++i) {
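The channel-block and round-up arithmetic these kernels share (RoundUpDiv4 for the 4-channels-per-image-pixel packing, RoundUp of gws to a multiple of lws before enqueue) is simple enough to verify standalone; the formulas below follow the usual convention and match how the helpers are used above:

    #include <cassert>
    #include <cstdint>

    inline int64_t RoundUpDiv4(int64_t v) { return (v + 3) / 4; }

    inline uint32_t RoundUp(uint32_t v, uint32_t factor) {
      return factor == 0 ? v : (v + factor - 1) / factor * factor;
    }

    int main() {
      assert(RoundUpDiv4(11) == 3);     // 11 channels -> 3 channel blocks
      assert(RoundUp(100, 16) == 112);  // gws rounded to an lws multiple
      return 0;
    }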
diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc
index 74d8776fa089168c87ea7b1751244d3151e28492..43210171a743bdd7dd5640ccaf2415c23bacd553 100644
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -24,12 +24,12 @@ namespace kernels {
 template <typename T>
 MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name;
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     if (wino_blk_size_ == 4) {
       obfuscated_kernel_name =
@@ -120,7 +120,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
              output_tensor->dim(0), output_tensor->dim(1),
              output_tensor->dim(2));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
@@ -132,7 +132,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
     const std::vector<const Tensor *> &inputs,
     Tensor *output_tensor,
     StatsFuture *future) {
-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   const Tensor *input_tensor = inputs[0];
   const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr;
@@ -140,7 +140,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name;
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     if (wino_blk_size_ == 4) {
       obfuscated_kernel_name =
@@ -241,7 +241,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
       Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
              output_tensor->dim(1), output_tensor->dim(2),
              output_tensor->dim(3), input_tensor->dim(2));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/pad.h b/mace/kernels/pad.h
index de851bb7093781bea137b89c04a991323809af29..14a4c8d6f4b7438709f1af05d776bec7cb273883 100644
--- a/mace/kernels/pad.h
+++ b/mace/kernels/pad.h
@@ -21,6 +21,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"

 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -29,10 +30,13 @@ namespace mace {
 namespace kernels {

-struct PadFunctorBase {
-  PadFunctorBase(const std::vector<int> &paddings,
+struct PadFunctorBase : OpKernel {
+  PadFunctorBase(OpKernelContext *context,
+                 const std::vector<int> &paddings,
                  const float constant_value)
-      : paddings_(paddings), constant_value_(constant_value) {}
+      : OpKernel(context),
+        paddings_(paddings),
+        constant_value_(constant_value) {}

   std::vector<int> paddings_;
   float constant_value_;
@@ -40,9 +44,10 @@ struct PadFunctorBase {

 template <DeviceType D, typename T>
 struct PadFunctor : public PadFunctorBase {
-  PadFunctor(const std::vector<int> &paddings,
+  PadFunctor(OpKernelContext *context,
+             const std::vector<int> &paddings,
              const float constant_value)
-      : PadFunctorBase(paddings, constant_value) {}
+      : PadFunctorBase(context, paddings, constant_value) {}

   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
@@ -93,9 +98,10 @@ struct PadFunctor : public PadFunctorBase {
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
 struct PadFunctor<DeviceType::GPU, T> : PadFunctorBase {
-  PadFunctor(const std::vector<int> &paddings,
+  PadFunctor(OpKernelContext *context,
+             const std::vector<int> &paddings,
              const float constant_value)
-      : PadFunctorBase(paddings, constant_value) {}
+      : PadFunctorBase(context, paddings, constant_value) {}

   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index 94a388bec7227d3d39f0caa60fddb30a13c059ae..c61745284b2288278be0d9c95076a9ab76af45cb 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -23,6 +23,7 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/kernels/kernel.h"

 #if defined(MACE_ENABLE_NEON)
 #include <arm_neon.h>
@@ -41,14 +42,16 @@ enum PoolingType {

 namespace kernels {

-struct PoolingFunctorBase {
-  PoolingFunctorBase(const PoolingType pooling_type,
+struct PoolingFunctorBase : OpKernel {
+  PoolingFunctorBase(OpKernelContext *context,
+                     const PoolingType pooling_type,
                      const int *kernels,
                      const int *strides,
                      const Padding padding_type,
                      const std::vector<int> &paddings,
                      const int *dilations)
-      : pooling_type_(pooling_type),
+      : OpKernel(context),
+        pooling_type_(pooling_type),
         kernels_(kernels),
         strides_(strides),
         padding_type_(padding_type),
@@ -68,14 +71,20 @@ struct PoolingFunctor;

 template <>
 struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
-  PoolingFunctor(const PoolingType pooling_type,
+  PoolingFunctor(OpKernelContext *context,
+                 const PoolingType pooling_type,
                  const int *kernels,
                  const int *strides,
                  const Padding padding_type,
                  const std::vector<int> &paddings,
                  const int *dilations)
-      : PoolingFunctorBase(
-            pooling_type, kernels, strides, padding_type, paddings, dilations) {
+      : PoolingFunctorBase(context,
+                           pooling_type,
+                           kernels,
+                           strides,
+                           padding_type,
+                           paddings,
+                           dilations) {
   }

   void MaxPooling(const float *input,
@@ -231,15 +240,20 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {

 template <>
 struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase {
-  PoolingFunctor(const PoolingType pooling_type,
+  PoolingFunctor(OpKernelContext *context,
+                 const PoolingType pooling_type,
                  const int *kernels,
                  const int *strides,
                  const Padding padding_type,
                  const std::vector<int> &paddings,
                  const int *dilations)
-      : PoolingFunctorBase(
-            pooling_type, kernels, strides, padding_type, paddings, dilations) {
-  }
+      : PoolingFunctorBase(context,
+                           pooling_type,
+                           kernels,
+                           strides,
+                           padding_type,
+                           paddings,
+                           dilations) {}

   void MaxPooling(const uint8_t *input,
                   const index_t *in_shape,
@@ -443,14 +457,20 @@ struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase {
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
 struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase {
-  PoolingFunctor(const PoolingType pooling_type,
+  PoolingFunctor(OpKernelContext *context,
+                 const PoolingType pooling_type,
                  const int *kernels,
                  const int *strides,
                  const Padding padding_type,
                  const std::vector<int> &paddings,
                  const int *dilations)
-      : PoolingFunctorBase(
-            pooling_type, kernels, strides, padding_type, paddings, dilations) {
+      : PoolingFunctorBase(context,
+                           pooling_type,
+                           kernels,
+                           strides,
+                           padding_type,
+                           paddings,
+                           dilations) {
   }

   MaceStatus operator()(const Tensor *input_tensor,
                         Tensor *output_tensor,
diff --git a/mace/kernels/proposal.h b/mace/kernels/proposal.h
index 89f79b7fa702a05de8b7d781c380420915a2ca20..aa002988a53f3145f945b145432da2d21ae34f01 100644
--- a/mace/kernels/proposal.h
+++ b/mace/kernels/proposal.h
@@ -21,6 +21,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"

 namespace mace {
@@ -121,8 +122,9 @@ inline std::vector<int> nms(const float *bboxes_ptr,

 template <DeviceType D, typename T>
-struct ProposalFunctor {
-  ProposalFunctor(const int min_size,
+struct ProposalFunctor : OpKernel {
+  ProposalFunctor(OpKernelContext *context,
+                  const int min_size,
                   const float nms_thresh,
                   const int pre_nms_top_n,
                   const int post_nms_top_n,
@@ -130,6 +132,7 @@ struct ProposalFunctor {
                   const int base_size,
                   const std::vector<float> &scales,
                   const std::vector<float> &ratios) :
+      OpKernel(context),
       min_size_(min_size),
       thresh_(nms_thresh),
       pre_nms_top_n_(pre_nms_top_n),
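For readers unfamiliar with the nms() helper that ProposalFunctor configures (nms_thresh, pre/post_nms_top_n): greedy non-maximum suppression over score-sorted boxes is the textbook form below. This is a generic sketch for orientation only; the in-tree nms() differs in data layout and tie-breaking:

    #include <algorithm>
    #include <vector>

    struct Box { float x1, y1, x2, y2; };

    inline float IoU(const Box &a, const Box &b) {
      float ix = std::max(0.f, std::min(a.x2, b.x2) - std::max(a.x1, b.x1));
      float iy = std::max(0.f, std::min(a.y2, b.y2) - std::max(a.y1, b.y1));
      float inter = ix * iy;
      float uni = (a.x2 - a.x1) * (a.y2 - a.y1) +
                  (b.x2 - b.x1) * (b.y2 - b.y1) - inter;
      return uni > 0.f ? inter / uni : 0.f;
    }

    // 'boxes' must already be sorted by descending score.
    std::vector<int> GreedyNMS(const std::vector<Box> &boxes, float thresh) {
      std::vector<int> keep;
      std::vector<bool> removed(boxes.size(), false);
      for (size_t i = 0; i < boxes.size(); ++i) {
        if (removed[i]) continue;
        keep.push_back(static_cast<int>(i));
        for (size_t j = i + 1; j < boxes.size(); ++j) {
          if (!removed[j] && IoU(boxes[i], boxes[j]) > thresh) removed[j] = true;
        }
      }
      return keep;
    }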
"mace/core/tensor.h" +#include "mace/kernels/kernel.h" namespace mace { namespace kernels { @@ -173,8 +174,8 @@ template struct QuantizeFunctor; template<> -struct QuantizeFunctor { - QuantizeFunctor() {} +struct QuantizeFunctor : OpKernel { + explicit QuantizeFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, const bool non_zero, @@ -212,8 +213,8 @@ template struct DequantizeFunctor; template<> -struct DequantizeFunctor { - DequantizeFunctor() {} +struct DequantizeFunctor : OpKernel { + explicit DequantizeFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, Tensor *output, diff --git a/mace/kernels/reduce_mean.h b/mace/kernels/reduce_mean.h index 65dc67d91c07e3b6f663f67f1135417f2accbc59..71fc2de028e207249cc95c48d82699f77d6c353f 100644 --- a/mace/kernels/reduce_mean.h +++ b/mace/kernels/reduce_mean.h @@ -24,6 +24,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" #endif @@ -31,10 +32,12 @@ namespace mace { namespace kernels { -struct ReduceFunctorBase { - ReduceFunctorBase(const std::vector &axis, +struct ReduceFunctorBase : OpKernel { + ReduceFunctorBase(OpKernelContext *context, + const std::vector &axis, const bool keep_dims) - : keep_dims_(keep_dims), + : OpKernel(context), + keep_dims_(keep_dims), axis_(axis) {} bool keep_dims_; bool reduce_first_axis_; @@ -44,10 +47,11 @@ struct ReduceFunctorBase { }; template -struct ReduceMeanFunctor : ReduceFunctorBase{ - ReduceMeanFunctor(const std::vector &axis, +struct ReduceMeanFunctor : ReduceFunctorBase { + ReduceMeanFunctor(OpKernelContext *context, + const std::vector &axis, const bool keep_dims) - : ReduceFunctorBase(axis, keep_dims) {} + : ReduceFunctorBase(context, axis, keep_dims) {} void Simplify(const Tensor *input) { std::vector bitmap(static_cast(input->dim_size()), false); @@ -220,9 +224,10 @@ struct ReduceMeanFunctor : ReduceFunctorBase{ template struct ReduceMeanFunctor : ReduceFunctorBase { - ReduceMeanFunctor(const std::vector axis, + ReduceMeanFunctor(OpKernelContext *context, + const std::vector axis, const bool keep_dims) - : ReduceFunctorBase(axis, keep_dims) {} + : ReduceFunctorBase(context, axis, keep_dims) {} MaceStatus operator()(const Tensor *input, Tensor *output_tensor, diff --git a/mace/kernels/reshape.h b/mace/kernels/reshape.h index cfa7bb2e94012d6cd6cbd78bc81fd31df6472555..f0ab1bf583b226950a9382e3d5b7a78dfa388c0b 100644 --- a/mace/kernels/reshape.h +++ b/mace/kernels/reshape.h @@ -19,17 +19,14 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" - -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL +#include "mace/kernels/kernel.h" namespace mace { namespace kernels { template -struct ReshapeFunctor { - ReshapeFunctor() {} +struct ReshapeFunctor : OpKernel { + explicit ReshapeFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, const std::vector &out_shape, diff --git a/mace/kernels/resize_bicubic.h b/mace/kernels/resize_bicubic.h index b620b51d70822190d74e531d017a7be54c501d74..7245804154910b714e37a63315a7afc4ce40bf22 100644 --- a/mace/kernels/resize_bicubic.h +++ b/mace/kernels/resize_bicubic.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/utils/logging.h" #ifdef MACE_ENABLE_OPENCL @@ -137,10 +138,11 @@ 
diff --git a/mace/kernels/resize_bicubic.h b/mace/kernels/resize_bicubic.h
index b620b51d70822190d74e531d017a7be54c501d74..7245804154910b714e37a63315a7afc4ce40bf22 100644
--- a/mace/kernels/resize_bicubic.h
+++ b/mace/kernels/resize_bicubic.h
@@ -21,6 +21,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"
 #include "mace/utils/logging.h"

 #ifdef MACE_ENABLE_OPENCL
@@ -137,10 +138,11 @@ inline void ResizeImage(const float *images,
   }
 }

-struct ResizeBicubicFunctorBase {
-  ResizeBicubicFunctorBase(const std::vector<index_t> &size,
+struct ResizeBicubicFunctorBase : OpKernel {
+  ResizeBicubicFunctorBase(OpKernelContext *context,
+                           const std::vector<index_t> &size,
                            bool align_corners)
-      : align_corners_(align_corners) {
+      : OpKernel(context), align_corners_(align_corners) {
     MACE_CHECK(size.size() == 2);
     out_height_ = size[0];
     out_width_ = size[1];
@@ -158,8 +160,10 @@ struct ResizeBicubicFunctor;

 template<>
 struct ResizeBicubicFunctor<DeviceType::CPU, float>
     : ResizeBicubicFunctorBase {
-  ResizeBicubicFunctor(const std::vector<index_t> &size, bool align_corners)
-      : ResizeBicubicFunctorBase(size, align_corners) {}
+  ResizeBicubicFunctor(OpKernelContext *context,
+                       const std::vector<index_t> &size,
+                       bool align_corners)
+      : ResizeBicubicFunctorBase(context, size, align_corners) {}

   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
@@ -204,8 +208,10 @@ struct ResizeBicubicFunctor<DeviceType::CPU, float>

 template <typename T>
 struct ResizeBicubicFunctor<DeviceType::GPU, T> : ResizeBicubicFunctorBase {
-  ResizeBicubicFunctor(const std::vector<index_t> &size, bool align_corners)
-      : ResizeBicubicFunctorBase(size, align_corners) {}
+  ResizeBicubicFunctor(OpKernelContext *context,
+                       const std::vector<index_t> &size,
+                       bool align_corners)
+      : ResizeBicubicFunctorBase(context, size, align_corners) {}

   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h
index cb41ef451dcbf25265227c3005b5532759afdced..92e57b4fde5fa39b0a5ae2801b4077633731eae7 100644
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -21,6 +21,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"

 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -113,10 +114,12 @@ inline void ResizeImage(const float *images,
   }
 }

-struct ResizeBilinearFunctorBase {
-  ResizeBilinearFunctorBase(const std::vector<index_t> &size,
+struct ResizeBilinearFunctorBase : OpKernel {
+  ResizeBilinearFunctorBase(OpKernelContext *context,
+                            const std::vector<index_t> &size,
                             bool align_corners)
-      : align_corners_(align_corners) {
+      : OpKernel(context),
+        align_corners_(align_corners) {
     MACE_CHECK(size.size() == 2);
     out_height_ = size[0];
     out_width_ = size[1];
@@ -134,8 +137,10 @@ struct ResizeBilinearFunctor;

 template<>
 struct ResizeBilinearFunctor<DeviceType::CPU, float>
     : ResizeBilinearFunctorBase {
-  ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
-      : ResizeBilinearFunctorBase(size, align_corners) {}
+  ResizeBilinearFunctor(OpKernelContext *context,
+                        const std::vector<index_t> &size,
+                        bool align_corners)
+      : ResizeBilinearFunctorBase(context, size, align_corners) {}

   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
@@ -187,8 +192,10 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>

 template <typename T>
 struct ResizeBilinearFunctor<DeviceType::GPU, T> : ResizeBilinearFunctorBase {
-  ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
-      : ResizeBilinearFunctorBase(size, align_corners) {}
+  ResizeBilinearFunctor(OpKernelContext *context,
+                        const std::vector<index_t> &size,
+                        bool align_corners)
+      : ResizeBilinearFunctorBase(context, size, align_corners) {}

   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
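The align_corners flag threaded through both resize functor bases selects between the two standard source-coordinate mappings, and the CPU ResizeImage path reduces to the usual bilinear lerp. Shown here for orientation, not copied from the in-tree helper:

    #include <cassert>

    // One output pixel: lerp across the top and bottom edges, then between them.
    float Bilerp(float tl, float tr, float bl, float br, float dx, float dy) {
      float top = tl + (tr - tl) * dx;
      float bottom = bl + (br - bl) * dx;
      return top + (bottom - top) * dy;
    }

    // With align_corners, the source coordinate for output index i is
    //   i * (in_size - 1) / (out_size - 1);
    // otherwise it is i * in_size / out_size.

    int main() {
      assert(Bilerp(0.f, 1.f, 0.f, 1.f, 0.5f, 0.5f) == 0.5f);
      return 0;
    }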
EltwiseType type, - const std::vector &coeff, - const float scalar_input, - const int32_t scalar_input_index) - : type_(type), +struct ScalarMathFunctor : OpKernel { + ScalarMathFunctor(OpKernelContext *context, + const EltwiseType type, + const std::vector &coeff, + const float scalar_input, + const int32_t scalar_input_index) + : OpKernel(context), + type_(type), coeff_(coeff), scalar_input_(scalar_input), scalar_input_index_(scalar_input_index) {} diff --git a/mace/kernels/sgemm.h b/mace/kernels/sgemm.h index 15cec1dd9779166f3c220d3c4f589296ae48d706..3aaf5d478324ed8ec4d32452ceeb39422d89ac1f 100644 --- a/mace/kernels/sgemm.h +++ b/mace/kernels/sgemm.h @@ -89,7 +89,7 @@ typedef Major PackOrder; template class PackedBlock { public: - PackedBlock() : data_tensor_(GetDeviceAllocator(CPU), + PackedBlock() : data_tensor_(GetCPUAllocator(), DataTypeToEnum::v()) {} const T *data() { diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h index 5de3ade1ec34b1a71e884bd02436a6cd11b0022a..0c2c91268f4d904daddfe401a166ae8b21a0e7eb 100644 --- a/mace/kernels/softmax.h +++ b/mace/kernels/softmax.h @@ -27,6 +27,7 @@ #include "mace/utils/utils.h" #include "mace/kernels/fixpoint.h" #include "mace/kernels/gemmlowp_util.h" +#include "mace/kernels/kernel.h" #include "mace/kernels/quantize.h" #ifdef MACE_ENABLE_OPENCL @@ -40,7 +41,8 @@ template struct SoftmaxFunctor; template<> -struct SoftmaxFunctor { +struct SoftmaxFunctor : OpKernel { + explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future) { @@ -127,7 +129,8 @@ static const int kInputDeltaIntBits = 6; static const int kSumExpIntBits = 12; template<> -struct SoftmaxFunctor { +struct SoftmaxFunctor : OpKernel { + explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future) { @@ -354,7 +357,8 @@ struct SoftmaxFunctor { #ifdef MACE_ENABLE_OPENCL template -struct SoftmaxFunctor { +struct SoftmaxFunctor : OpKernel { + explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *logits, Tensor *output, StatsFuture *future); diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h index 786e270a41c30cfa7536725d379f2ed652b50ebc..7670632a2620c1d2552097127251ee5d850047d9 100644 --- a/mace/kernels/space_to_batch.h +++ b/mace/kernels/space_to_batch.h @@ -21,7 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" -#include "mace/public/mace.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -30,11 +30,13 @@ namespace mace { namespace kernels { -struct SpaceToBatchFunctorBase { - SpaceToBatchFunctorBase(const std::vector &paddings, +struct SpaceToBatchFunctorBase : OpKernel { + SpaceToBatchFunctorBase(OpKernelContext *context, + const std::vector &paddings, const std::vector &block_shape, bool b2s) - : paddings_(paddings.begin(), paddings.end()), + : OpKernel(context), + paddings_(paddings.begin(), paddings.end()), block_shape_(block_shape.begin(), block_shape.end()), b2s_(b2s) { MACE_CHECK( @@ -135,10 +137,11 @@ struct SpaceToBatchFunctor; template<> struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { - SpaceToBatchFunctor(const std::vector &paddings, + SpaceToBatchFunctor(OpKernelContext *context, + const std::vector &paddings, const std::vector &block_shape, bool b2s) - : SpaceToBatchFunctorBase(paddings, block_shape, b2s) {} + : 
SpaceToBatchFunctorBase(context, paddings, block_shape, b2s) {} MaceStatus operator()(Tensor *space_tensor, Tensor *batch_tensor, @@ -319,10 +322,11 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { - SpaceToBatchFunctor(const std::vector &paddings, + SpaceToBatchFunctor(OpKernelContext *context, + const std::vector &paddings, const std::vector &block_shape, bool b2s) - : SpaceToBatchFunctorBase(paddings, block_shape, b2s) {} + : SpaceToBatchFunctorBase(context, paddings, block_shape, b2s) {} MaceStatus operator()(Tensor *space_tensor, Tensor *batch_tensor, diff --git a/mace/kernels/split.h b/mace/kernels/split.h index 95ff7861142e3f146f461328d04d1d21f2eb5a51..899e74dac04f9de7c11eb2c3e94f01706b464828 100644 --- a/mace/kernels/split.h +++ b/mace/kernels/split.h @@ -22,6 +22,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/core/types.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -31,15 +32,17 @@ namespace mace { namespace kernels { -struct SplitFunctorBase { - explicit SplitFunctorBase(const int32_t axis) : axis_(axis) {} +struct SplitFunctorBase : OpKernel { + SplitFunctorBase(OpKernelContext *context, const int32_t axis) + : OpKernel(context), axis_(axis) {} int32_t axis_; }; template struct SplitFunctor : SplitFunctorBase { - explicit SplitFunctor(const int32_t axis) : SplitFunctorBase(axis) {} + SplitFunctor(OpKernelContext *context, const int32_t axis) + : SplitFunctorBase(context, axis) {} MaceStatus operator()(const Tensor *input, const std::vector &output_list, @@ -90,11 +93,12 @@ struct SplitFunctor : SplitFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct SplitFunctor : SplitFunctorBase { - explicit SplitFunctor(const int32_t axis) : SplitFunctorBase(axis) {} + SplitFunctor(OpKernelContext *context, const int32_t axis) + : SplitFunctorBase(context, axis) {} MaceStatus operator()(const Tensor *input, - const std::vector &output_list, - StatsFuture *future); + const std::vector &output_list, + StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; std::unique_ptr kernel_error_; diff --git a/mace/kernels/stack.h b/mace/kernels/stack.h index 9a84bed0a4d5fc41670aa4d7c5cdae4aafb9544b..4d465784ed18e73ccb1084c4666e89786002c6ce 100644 --- a/mace/kernels/stack.h +++ b/mace/kernels/stack.h @@ -22,14 +22,16 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" namespace mace { namespace kernels { template -struct StackFunctor { - explicit StackFunctor(int axis) : axis_(axis) {} +struct StackFunctor : OpKernel { + StackFunctor(OpKernelContext *context, int axis) + : OpKernel(context), axis_(axis) {} MaceStatus operator()(const std::vector &inputs, Tensor *output, diff --git a/mace/kernels/strided_slice.h b/mace/kernels/strided_slice.h index a6afb46c56cd2e500899197836b5803583dc6c06..a5d0eb3828d365f3e38ed4d5f4520e3092997eb8 100644 --- a/mace/kernels/strided_slice.h +++ b/mace/kernels/strided_slice.h @@ -21,26 +21,29 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" namespace mace { namespace kernels { template -struct StridedSliceFunctor { - StridedSliceFunctor(int begin_mask, +struct StridedSliceFunctor : OpKernel { + StridedSliceFunctor(OpKernelContext *context, + int begin_mask, int end_mask, int ellipsis_mask, int new_axis_mask, int shrink_axis_mask, bool 
is_slice) - : begin_mask_(begin_mask), + : OpKernel(context), + begin_mask_(begin_mask), end_mask_(end_mask), ellipsis_mask_(ellipsis_mask), new_axis_mask_(new_axis_mask), shrink_axis_mask_(shrink_axis_mask), is_slice_(is_slice), - tmp_strides_tensor_(GetDeviceAllocator(D), + tmp_strides_tensor_(context->device()->allocator(), DataTypeToEnum::v()) {} MaceStatus operator()(const Tensor *input, diff --git a/mace/kernels/transpose.h b/mace/kernels/transpose.h index 8de796aa9259474639c31c37b60a7d6f1439710d..87f9c0e2ab1e9115b520f68ff248b85cbede06e8 100644 --- a/mace/kernels/transpose.h +++ b/mace/kernels/transpose.h @@ -105,8 +105,9 @@ static void TransposeNCHWToNHWCC2(const float *input, } template -struct TransposeFunctor { - explicit TransposeFunctor(const std::vector &dims) : dims_(dims) {} +struct TransposeFunctor : OpKernel { + TransposeFunctor(OpKernelContext *context, const std::vector &dims) + : OpKernel(context), dims_(dims) {} MaceStatus operator()(const Tensor *input, Tensor *output, diff --git a/mace/kernels/unstack.h b/mace/kernels/unstack.h index 82b5c467c69180366483672d13eb1e1c9c2a936f..b193c6b5a96455bf670983eb08e505790ad6afee 100644 --- a/mace/kernels/unstack.h +++ b/mace/kernels/unstack.h @@ -22,14 +22,16 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" namespace mace { namespace kernels { template -struct UnstackFunctor { - explicit UnstackFunctor(int axis) : axis_(axis) {} +struct UnstackFunctor : OpKernel { + UnstackFunctor(OpKernelContext *context, int axis) + : OpKernel(context), axis_(axis) {} MaceStatus operator()(const Tensor *input, const std::vector &outputs, diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h index c7d6fc1aaf681d6a02e33dfc374da4dadcf6e6fb..c2e267c480f59118c33380aaf342d04ae37f3b3d 100644 --- a/mace/kernels/winograd_transform.h +++ b/mace/kernels/winograd_transform.h @@ -30,11 +30,13 @@ namespace mace { namespace kernels { -struct WinogradTransformFunctorBase { - WinogradTransformFunctorBase(const Padding &padding_type, +struct WinogradTransformFunctorBase : OpKernel { + WinogradTransformFunctorBase(OpKernelContext *context, + const Padding &padding_type, const std::vector &paddings, const int block_size) - : strides_({1, 1}), + : OpKernel(context), + strides_({1, 1}), dilations_({1, 1}), padding_type_(padding_type), paddings_(paddings), @@ -49,10 +51,14 @@ struct WinogradTransformFunctorBase { template struct WinogradTransformFunctor : WinogradTransformFunctorBase { - WinogradTransformFunctor(const Padding &padding_type, + WinogradTransformFunctor(OpKernelContext *context, + const Padding &padding_type, const std::vector &paddings, const int block_size) - : WinogradTransformFunctorBase(padding_type, paddings, block_size) {} + : WinogradTransformFunctorBase(context, + padding_type, + paddings, + block_size) {} MaceStatus operator()(const Tensor *input, Tensor *output, @@ -69,10 +75,14 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase { template struct WinogradTransformFunctor : WinogradTransformFunctorBase { - WinogradTransformFunctor(const Padding &padding_type, + WinogradTransformFunctor(OpKernelContext *context, + const Padding &padding_type, const std::vector &paddings, const int block_size) - : WinogradTransformFunctorBase(padding_type, paddings, block_size) {} + : WinogradTransformFunctorBase(context, + padding_type, + paddings, + block_size) {} MaceStatus operator()(const Tensor *input, Tensor *output, @@ -85,11 +95,13 @@ 
struct WinogradTransformFunctor<DeviceType::GPU, T>
 };
 #endif  // MACE_ENABLE_OPENCL

-struct WinogradInverseTransformFunctorBase {
-  WinogradInverseTransformFunctorBase(const ActivationType activation,
+struct WinogradInverseTransformFunctorBase : OpKernel {
+  WinogradInverseTransformFunctorBase(OpKernelContext *context,
+                                      const ActivationType activation,
                                       const float relux_max_limit,
                                       const int block_size)
-      : wino_blk_size_(block_size),
+      : OpKernel(context),
+        wino_blk_size_(block_size),
         activation_(activation),
         relux_max_limit_(relux_max_limit) {}
@@ -100,11 +112,12 @@ struct WinogradInverseTransformFunctorBase {
 template<DeviceType D, typename T>
 struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
-  WinogradInverseTransformFunctor(const ActivationType activation,
+  WinogradInverseTransformFunctor(OpKernelContext *context,
+                                  const ActivationType activation,
                                   const float relux_max_limit,
                                   const int block_size)
       : WinogradInverseTransformFunctorBase(
-            activation, relux_max_limit, block_size) {}
+            context, activation, relux_max_limit, block_size) {}

   MaceStatus operator()(const std::vector<const Tensor *> &inputs,
                         Tensor *output,
@@ -121,11 +134,12 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
 template<typename T>
 struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
     : WinogradInverseTransformFunctorBase {
-  WinogradInverseTransformFunctor(const ActivationType activation,
+  WinogradInverseTransformFunctor(OpKernelContext *context,
+                                  const ActivationType activation,
                                   const float relux_max_limit,
                                   const int block_size)
       : WinogradInverseTransformFunctorBase(
-            activation, relux_max_limit, block_size) {}
+            context, activation, relux_max_limit, block_size) {}

   MaceStatus operator()(const std::vector<const Tensor *> &inputs,
                         Tensor *output,
diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc
index 65d7d03c2debab0f78ff185bf3915a1e0f76039c..80a3594363842db02815f25a0b59d33003a9fc75 100644
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -21,10 +21,12 @@
 #include
 #include "mace/core/net.h"
+#include "mace/core/device_context.h"
 #include "mace/ops/ops_register.h"
 #include "mace/public/mace.h"

 #ifdef MACE_ENABLE_OPENCL
+#include "mace/core/runtime/opencl/gpu_device.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #endif  // MACE_ENABLE_OPENCL
@@ -63,9 +65,9 @@ void UnloadModelData(const unsigned char *model_data,
 }

 #ifdef MACE_ENABLE_OPENCL
-MaceStatus CheckGPUAvalibility(const NetDef *net_def) {
+MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
   // Check OpenCL avaliable
-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = device->opencl_runtime();
   if (!runtime->is_opencl_avaliable()) {
     return MaceStatus::MACE_OUT_OF_RESOURCES;
   }
@@ -101,6 +103,199 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def) {

 }  // namespace

+class GPUContextBuilder::Impl {
+ public:
+  void SetStoragePath(const std::string &path);
+
+  void SetOpenCLBinaryPaths(const std::vector<std::string> &paths);
+
+  void SetOpenCLParameterPath(const std::string &path);
+
+  std::shared_ptr<GPUContext> Finalize();
+
+ public:
+  std::string storage_path_;
+  std::vector<std::string> opencl_binary_paths_;
+  std::string opencl_parameter_path_;
+};
+
+void GPUContextBuilder::Impl::SetStoragePath(const std::string &path) {
+  storage_path_ = path;
+}
+
+void GPUContextBuilder::Impl::SetOpenCLBinaryPaths(
+    const std::vector<std::string> &paths) {
+  opencl_binary_paths_ = paths;
+}
+
+void GPUContextBuilder::Impl::SetOpenCLParameterPath(
+    const std::string &path) {
+  opencl_parameter_path_ = path;
+}
+
+std::shared_ptr<GPUContext> GPUContextBuilder::Impl::Finalize() {
+  return std::shared_ptr<GPUContext>(new GPUContext(storage_path_,
+                                                    opencl_binary_paths_,
+                                                    opencl_parameter_path_));
+}
+
+GPUContextBuilder::GPUContextBuilder() : impl_(new GPUContextBuilder::Impl) {}
+
+GPUContextBuilder::~GPUContextBuilder() = default;
+
+GPUContextBuilder &GPUContextBuilder::SetStoragePath(const std::string &path) {
+  impl_->SetStoragePath(path);
+  return *this;
+}
+
+GPUContextBuilder &GPUContextBuilder::SetOpenCLBinaryPaths(
+    const std::vector<std::string> &paths) {
+  impl_->SetOpenCLBinaryPaths(paths);
+  return *this;
+}
+
+GPUContextBuilder &GPUContextBuilder::SetOpenCLParameterPath(
+    const std::string &path) {
+  impl_->SetOpenCLParameterPath(path);
+  return *this;
+}
+
+std::shared_ptr<GPUContext> GPUContextBuilder::Finalize() {
+  return impl_->Finalize();
+}
+
+class MaceEngineConfig::Impl {
+ public:
+  explicit Impl(const DeviceType device_type);
+  ~Impl() = default;
+
+  MaceStatus SetGPUContext(std::shared_ptr<GPUContext> context);
+
+  MaceStatus SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);
+
+  MaceStatus SetCPUThreadPolicy(int num_threads_hint,
+                                CPUAffinityPolicy policy,
+                                bool use_gemmlowp);
+
+  MaceStatus SetOpenMPThreadAffinity(int num_threads,
+                                     const std::vector<int> &cpu_ids);
+
+  inline DeviceType device_type() const {
+    return device_type_;
+  }
+
+  inline int num_threads() const {
+    return num_threads_;
+  }
+
+  inline std::shared_ptr<GPUContext> gpu_context() const {
+    return gpu_context_;
+  }
+
+  inline GPUPriorityHint gpu_priority_hint() const {
+    return gpu_priority_hint_;
+  }
+
+  inline GPUPerfHint gpu_perf_hint() const {
+    return gpu_perf_hint_;
+  }
+
+ private:
+  DeviceType device_type_;
+  int num_threads_;
+  std::shared_ptr<GPUContext> gpu_context_;
+  GPUPriorityHint gpu_priority_hint_;
+  GPUPerfHint gpu_perf_hint_;
+};
+
+MaceEngineConfig::Impl::Impl(const DeviceType device_type)
+    : device_type_(device_type),
+      num_threads_(-1),
+      gpu_context_(new GPUContext),
+      gpu_priority_hint_(GPUPriorityHint::PRIORITY_LOW),
+      gpu_perf_hint_(GPUPerfHint::PERF_NORMAL) {}
+
+MaceStatus MaceEngineConfig::Impl::SetGPUContext(
+    std::shared_ptr<GPUContext> context) {
+  gpu_context_ = context;
+  return MACE_SUCCESS;
+}
+
+MaceStatus MaceEngineConfig::Impl::SetGPUHints(
+    GPUPerfHint perf_hint,
+    GPUPriorityHint priority_hint) {
+  gpu_perf_hint_ = perf_hint;
+  gpu_priority_hint_ = priority_hint;
+  return MACE_SUCCESS;
+}
+
+MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy(
+    int num_threads,
+    CPUAffinityPolicy policy,
+    bool use_gemmlowp) {
+  num_threads_ = num_threads;
+  return mace::SetOpenMPThreadsAndAffinityPolicy(
+      num_threads, policy, use_gemmlowp);
+}
+
+MaceStatus MaceEngineConfig::Impl::SetOpenMPThreadAffinity(
+    int num_threads,
+    const std::vector<int> &cpu_ids) {
+  num_threads_ = num_threads;
+  return mace::SetOpenMPThreadsAndAffinityCPUs(num_threads, cpu_ids);
+}
+
+
+MaceEngineConfig::MaceEngineConfig(
+    const DeviceType device_type)
+    : impl_(new MaceEngineConfig::Impl(device_type)) {}
+
+MaceEngineConfig::~MaceEngineConfig() = default;
+
+MaceStatus MaceEngineConfig::SetGPUContext(
+    std::shared_ptr<GPUContext> context) {
+  return impl_->SetGPUContext(context);
+}
+
+MaceStatus MaceEngineConfig::SetGPUHints(
+    GPUPerfHint perf_hint,
+    GPUPriorityHint priority_hint) {
+  return impl_->SetGPUHints(perf_hint, priority_hint);
+}
+
+MaceStatus MaceEngineConfig::SetCPUThreadPolicy(
+    int num_threads_hint,
+    CPUAffinityPolicy policy,
+    bool use_gemmlowp) {
+  return impl_->SetCPUThreadPolicy(num_threads_hint, policy, use_gemmlowp);
+}
+
+MaceStatus MaceEngineConfig::SetOpenMPThreadAffinity(
+    int num_threads,
+    const std::vector<int> &cpu_ids) {
+  return
impl_->SetOpenMPThreadAffinity(num_threads, cpu_ids); +} + +DeviceType MaceEngineConfig::device_type() const { + return impl_->device_type(); +} + +int MaceEngineConfig::num_threads() const { + return impl_->num_threads(); +} + +std::shared_ptr MaceEngineConfig::gpu_context() const { + return impl_->gpu_context(); +} + +GPUPerfHint MaceEngineConfig::gpu_perf_hint() const { + return impl_->gpu_perf_hint(); +} + +GPUPriorityHint MaceEngineConfig::gpu_priority_hint() const { + return impl_->gpu_priority_hint(); +} + // Mace Tensor class MaceTensor::Impl { public: @@ -155,7 +350,7 @@ std::shared_ptr MaceTensor::data() { return impl_->data; } // Mace Engine class MaceEngine::Impl { public: - explicit Impl(DeviceType device_type); + explicit Impl(const MaceEngineConfig &config); ~Impl(); @@ -178,6 +373,7 @@ class MaceEngine::Impl { size_t model_data_size_; std::shared_ptr op_registry_; DeviceType device_type_; + std::unique_ptr device_; std::unique_ptr ws_; std::unique_ptr net_; std::map input_info_map_; @@ -189,11 +385,12 @@ class MaceEngine::Impl { MACE_DISABLE_COPY_AND_ASSIGN(Impl); }; -MaceEngine::Impl::Impl(DeviceType device_type) +MaceEngine::Impl::Impl(const MaceEngineConfig &config) : model_data_(nullptr), model_data_size_(0), op_registry_(new OperatorRegistry()), - device_type_(device_type), + device_type_(config.device_type()), + device_(nullptr), ws_(new Workspace()), net_(nullptr) #ifdef MACE_ENABLE_HEXAGON @@ -201,6 +398,19 @@ MaceEngine::Impl::Impl(DeviceType device_type) #endif { LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion(); + if (device_type_ == DeviceType::CPU || device_type_ == DeviceType::HEXAGON) { + device_.reset(new CPUDevice(config.num_threads())); + } +#ifdef MACE_ENABLE_OPENCL + if (device_type_ == DeviceType::GPU) { + device_.reset(new GPUDevice(config.gpu_context()->opencl_tuner(), + config.gpu_context()->opencl_cache_storage(), + config.gpu_priority_hint(), + config.gpu_perf_hint(), + config.gpu_context()->opencl_binary_storage(), + config.num_threads())); + } +#endif } MaceStatus MaceEngine::Impl::Init( @@ -212,7 +422,7 @@ MaceStatus MaceEngine::Impl::Init( // Check avalibility #ifdef MACE_ENABLE_OPENCL if (device_type_ == DeviceType::GPU) { - MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def)); + MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def, device_.get())); } #endif // Get input and output information. 
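
The hunks above establish the pattern this patch applies to every kernel: functors derive from OpKernel and receive an OpKernelContext, so device resources (allocator, OpenCL runtime) flow through the context rather than the removed global registries. A minimal sketch of that shape; ExampleFunctor and its argument are hypothetical, and it assumes OpKernel keeps the pointer as a context_ member, matching how strided_slice.h reaches context->device()->allocator():

// Hypothetical functor: only the OpKernel/OpKernelContext wiring
// mirrors this patch.
#include "mace/core/future.h"
#include "mace/core/macros.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"

namespace mace {
namespace kernels {

struct ExampleFunctor : OpKernel {
  ExampleFunctor(OpKernelContext *context, int example_arg)
      : OpKernel(context),
        example_arg_(example_arg),
        // Scratch tensors take the device allocator from the context,
        // as tmp_strides_tensor_ does in strided_slice.h above.
        scratch_tensor_(context->device()->allocator(),
                        DataTypeToEnum<float>::v()) {}

  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
                        StatsFuture *future) {
    MACE_UNUSED(future);
    return output->ResizeLike(input);  // placeholder body
  }

  int example_arg_;
  Tensor scratch_tensor_;
};

}  // namespace kernels
}  // namespace mace

On the public-API side, the per-process setters give way to a MaceEngineConfig that travels into the engine, as the mace.cc hunks here show. A sketch of the new calling sequence, assuming model_pb holds the serialized model; the storage path, data file, and node names are placeholders, not values from this patch:

#include <memory>
#include <string>
#include <vector>
#include "mace/public/mace.h"

mace::MaceStatus BuildEngine(const std::vector<unsigned char> &model_pb,
                             std::shared_ptr<mace::MaceEngine> *engine) {
  mace::MaceEngineConfig config(mace::DeviceType::GPU);
  config.SetCPUThreadPolicy(4, mace::CPUAffinityPolicy::AFFINITY_BIG_ONLY,
                            true /* use_gemmlowp */);
  // The storage path must be a directory the app owns, per the
  // benchmark_model.cc comment.
  std::shared_ptr<mace::GPUContext> gpu_context =
      mace::GPUContextBuilder()
          .SetStoragePath("/data/data/com.example/mace")
          .Finalize();
  config.SetGPUContext(gpu_context);
  config.SetGPUHints(mace::GPUPerfHint::PERF_HIGH,
                     mace::GPUPriorityHint::PRIORITY_NORMAL);
  return mace::CreateMaceEngineFromProto(model_pb, "model.data",
                                         {"input"}, {"output"},
                                         config, engine);
}
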
@@ -230,7 +440,7 @@ MaceStatus MaceEngine::Impl::Init( << MakeString(MapKeys(input_info_map_)); } ws_->CreateTensor(MakeString("mace_input_node_", input_name), - GetDeviceAllocator(device_type_), DT_FLOAT); + device_->allocator(), DT_FLOAT); } for (auto output_name : output_nodes) { if (output_info_map_.find(output_name) == output_info_map_.end()) { @@ -239,7 +449,7 @@ MaceStatus MaceEngine::Impl::Init( << MakeString(MapKeys(output_info_map_)); } ws_->CreateTensor(MakeString("mace_output_node_", output_name), - GetDeviceAllocator(device_type_), DT_FLOAT); + device_->allocator(), DT_FLOAT); } #ifdef MACE_ENABLE_HEXAGON if (device_type_ == HEXAGON) { @@ -255,19 +465,20 @@ MaceStatus MaceEngine::Impl::Init( } } else { #endif - MACE_RETURN_IF_ERROR(ws_->LoadModelTensor( - *net_def, device_type_, model_data)); + MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def, + device_.get(), + model_data)); // Init model - auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type_, + auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_.get(), NetMode::INIT); MACE_RETURN_IF_ERROR(net->Run()); - net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type_); + net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_.get()); #ifdef MACE_ENABLE_HEXAGON } #endif if (device_type_ == DeviceType::GPU) { - ws_->RemoveAndReloadBuffer(*net_def, model_data); + ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator()); } return MaceStatus::MACE_SUCCESS; } @@ -360,7 +571,7 @@ MaceStatus MaceEngine::Impl::Run( #ifdef MACE_ENABLE_OPENCL if (device_type_ == GPU) { - OpenCLRuntime::Global()->SaveBuiltCLProgram(); + device_->opencl_runtime()->SaveBuiltCLProgram(); } #endif for (auto &output : *outputs) { @@ -385,8 +596,8 @@ MaceStatus MaceEngine::Impl::Run( return MACE_SUCCESS; } -MaceEngine::MaceEngine(DeviceType device_type): - impl_(new MaceEngine::Impl(device_type)) {} +MaceEngine::MaceEngine(const MaceEngineConfig &config): + impl_(new MaceEngine::Impl(config)) {} MaceEngine::~MaceEngine() = default; @@ -421,7 +632,7 @@ MaceStatus CreateMaceEngineFromProto( const std::string &model_data_file, const std::vector &input_nodes, const std::vector &output_nodes, - const DeviceType device_type, + const MaceEngineConfig &config, std::shared_ptr *engine) { LOG(INFO) << "Create MaceEngine from model pb"; // load model @@ -432,7 +643,7 @@ MaceStatus CreateMaceEngineFromProto( std::shared_ptr net_def(new NetDef()); net_def->ParseFromArray(&model_pb[0], model_pb.size()); - engine->reset(new mace::MaceEngine(device_type)); + engine->reset(new mace::MaceEngine(config)); MaceStatus status = (*engine)->Init( net_def.get(), input_nodes, output_nodes, model_data_file); diff --git a/mace/libmace/mace_runtime.cc b/mace/libmace/mace_runtime.cc deleted file mode 100644 index 24b2cd8f32b04dc14a425cbd73945f6d7851a4a8..0000000000000000000000000000000000000000 --- a/mace/libmace/mace_runtime.cc +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/core/macros.h" -#include "mace/core/file_storage.h" -#include "mace/core/runtime/cpu/cpu_runtime.h" -#include "mace/public/mace_runtime.h" -#include "mace/utils/logging.h" - -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/opencl_runtime.h" -#endif // MACE_ENABLE_OPENCL - -namespace mace { - -class FileStorageFactory::Impl { - public: - explicit Impl(const std::string &path); - - std::unique_ptr CreateStorage(const std::string &name); - - private: - std::string path_; -}; - -FileStorageFactory::Impl::Impl(const std::string &path): path_(path) {} - -std::unique_ptr FileStorageFactory::Impl::CreateStorage( - const std::string &name) { - return std::move(std::unique_ptr( - new FileStorage(path_ + "/" + name))); -} - -FileStorageFactory::FileStorageFactory(const std::string &path): - impl_(new FileStorageFactory::Impl(path)) {} - -FileStorageFactory::~FileStorageFactory() = default; - -std::unique_ptr FileStorageFactory::CreateStorage( - const std::string &name) { - return impl_->CreateStorage(name); -} - -extern std::shared_ptr kStorageFactory; - -void SetKVStorageFactory(std::shared_ptr storage_factory) { - VLOG(1) << "Set internal KV Storage Engine"; - kStorageFactory = storage_factory; -} - -// Set OpenCL Compiled Binary paths, just call once. (Not thread-safe) -void SetOpenCLBinaryPaths(const std::vector &paths) { -#ifdef MACE_ENABLE_OPENCL - OpenCLRuntime::ConfigureOpenCLBinaryPath(paths); -#else - MACE_UNUSED(paths); -#endif // MACE_ENABLE_OPENCL -} - -extern std::string kOpenCLParameterPath; - -void SetOpenCLParameterPath(const std::string &path) { -#ifdef MACE_ENABLE_OPENCL - kOpenCLParameterPath = path; -#else - MACE_UNUSED(path); -#endif // MACE_ENABLE_OPENCL -} - -void SetGPUHints(GPUPerfHint gpu_perf_hint, GPUPriorityHint gpu_priority_hint) { -#ifdef MACE_ENABLE_OPENCL - VLOG(1) << "Set GPU configurations, gpu_perf_hint: " << gpu_perf_hint - << ", gpu_priority_hint: " << gpu_priority_hint; - OpenCLRuntime::Configure(gpu_perf_hint, gpu_priority_hint); -#else - MACE_UNUSED(gpu_perf_hint); - MACE_UNUSED(gpu_priority_hint); -#endif // MACE_ENABLE_OPENCL -} - -MaceStatus SetOpenMPThreadPolicy(int num_threads_hint, - CPUAffinityPolicy policy, - bool use_gemmlowp) { - VLOG(1) << "Set OpenMP threads number hint: " << num_threads_hint - << ", affinity policy: " << policy; - return SetOpenMPThreadsAndAffinityPolicy(num_threads_hint, - policy, - use_gemmlowp); -} - -MaceStatus SetOpenMPThreadAffinity(int num_threads, - const std::vector &cpu_ids) { - return SetOpenMPThreadsAndAffinityCPUs(num_threads, cpu_ids); -} - -MaceStatus GetBigLittleCoreIDs(std::vector *big_core_ids, - std::vector *little_core_ids) { - return GetCPUBigLittleCoreIDs(big_core_ids, little_core_ids); -} - - -}; // namespace mace diff --git a/mace/libmace/mace_version_script.lds b/mace/libmace/mace_version_script.lds index 76d8f1c2c553e09ac7aaa054999f4af543baa7b1..4bdc33db9f6162924285c52ecc9bbe76435d487d 100644 --- a/mace/libmace/mace_version_script.lds +++ b/mace/libmace/mace_version_script.lds @@ -1,15 +1,10 @@ mace { global: + *GPUContextBuilder*; + *MaceEngineConfig*; *MaceTensor*; *MaceEngine*; *CreateMaceEngineFromProto*; - *FileStorageFactory*; - *SetKVStorageFactory*; - *SetOpenCLBinaryPaths*; - *SetOpenCLParameterPath*; - *SetGPUHints*; - *SetOpenMPThreadPolicy*; - *SetOpenMPThreadAffinity*; *GetBigLittleCoreIDs*; *MaceVersion*; diff --git a/mace/ops/BUILD b/mace/ops/BUILD 
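
For reference, the mace/libmace/mace_runtime.cc deleted above was the last home of the process-global setters; each maps onto the config/builder API exported in its place (a rough correspondence for migrating callers, not text from the patch):

  SetOpenMPThreadPolicy(n, policy, gemm)  ->  MaceEngineConfig::SetCPUThreadPolicy(n, policy, gemm)
  SetOpenMPThreadAffinity(n, cpu_ids)     ->  MaceEngineConfig::SetOpenMPThreadAffinity(n, cpu_ids)
  SetGPUHints(perf, priority)             ->  MaceEngineConfig::SetGPUHints(perf, priority)
  SetOpenCLBinaryPaths(paths)             ->  GPUContextBuilder::SetOpenCLBinaryPaths(paths)
  SetOpenCLParameterPath(path)            ->  GPUContextBuilder::SetOpenCLParameterPath(path)
  SetKVStorageFactory(factory)            ->  GPUContextBuilder::SetStoragePath(path)

GetBigLittleCoreIDs remains exported by the version script and is unaffected. The ops/BUILD changes below then compile the test utilities with the same flags as the ops themselves:
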
index 07aad1d24b549c4be26a8521549e68aba50662c8..312bdc90babe7d04574a6823455893085771212e 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -23,8 +23,25 @@ cc_library( hdrs = [ "ops_test_util.h", ], + srcs = [ + "ops_test_util.cc", + ], + copts = [ + "-Werror", + "-Wextra", + ] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + ]) + if_android_armv7([ + "-mfloat-abi=softfp", + ]) + if_opencl_enabled([ + "-DMACE_ENABLE_OPENCL", + ]) + if_hexagon_enabled([ + "-DMACE_ENABLE_HEXAGON", + ]), deps = [ - "//mace/core", + "//mace/ops", "@gtest", ], ) @@ -36,6 +53,7 @@ cc_library( exclude = [ "*_test.cc", "*_benchmark.cc", + "ops_test_util.cc", "buffer_to_image.cc", "image_to_buffer.cc", "lstmcell.cc", diff --git a/mace/ops/activation.h b/mace/ops/activation.h index 8938ea74b5dab1fb112a0843922aacac8c8b67cd..3b48891e769b7133f9b780a66d6ada4760b4ee7e 100644 --- a/mace/ops/activation.h +++ b/mace/ops/activation.h @@ -26,9 +26,10 @@ namespace ops { template class ActivationOp : public Operator { public: - ActivationOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(kernels::StringToActivationType( + ActivationOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + kernels::StringToActivationType( OperatorBase::GetOptionalArg("activation", "NOOP")), static_cast( diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc index cc40ac9d7e2167c178ee10644b57d886fbc58289..49422f3a11969c89e1c55453bbfc40e44f797bdc 100644 --- a/mace/ops/activation_test.cc +++ b/mace/ops/activation_test.cc @@ -58,7 +58,7 @@ void TestSimpleRelu() { net.RunOp(D); } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -106,7 +106,7 @@ void TestUnalignedSimpleRelu() { net.RunOp(D); } - auto expected = CreateTensor({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5}); + auto expected = net.CreateTensor({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -159,7 +159,7 @@ void TestSimpleRelux() { net.RunOp(D); } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -209,7 +209,7 @@ void TestSimpleReluRelux() { net.RunOp(D); } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -267,7 +267,7 @@ void TestSimplePrelu() { } if (D == DeviceType::CPU) { - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -318,7 +318,7 @@ void TestSimpleTanh() { net.RunOp(D); } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {-0.99999834, 0.99999834, -0.99998771, 0.99998771, -0.9999092, 0.9999092, -0.9993293, 0.9993293, -0.99505475, 0.99505475, -0.96402758, 0.96402758, @@ -371,7 +371,7 @@ void TestSimpleSigmoid() { net.RunOp(D); } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {9.11051194e-04, 9.99088949e-01, 2.47262316e-03, 9.97527377e-01, 6.69285092e-03, 9.93307149e-01, 1.79862100e-02, 
9.82013790e-01, diff --git a/mace/ops/addn.h b/mace/ops/addn.h index 64373343363ff620d34ea735078c1291b7450616..4238a013e455723f9ad88cbdec8dee79be862885 100644 --- a/mace/ops/addn.h +++ b/mace/ops/addn.h @@ -26,8 +26,8 @@ namespace ops { template class AddNOp : public Operator { public: - AddNOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws) {} + AddNOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), functor_(context) {} MaceStatus Run(StatsFuture *future) override { Tensor *output_tensor = this->Output(0); diff --git a/mace/ops/addn_test.cc b/mace/ops/addn_test.cc index 2f5aa28a78c575733337e48c721ae486ec8a9ce7..7154ad52d097a9c09144f0b1d1630ca8be538e20 100644 --- a/mace/ops/addn_test.cc +++ b/mace/ops/addn_test.cc @@ -39,7 +39,7 @@ void SimpleAdd2() { // Run net.RunOp(D); - auto expected = CreateTensor({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12}); + auto expected = net.CreateTensor({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -98,7 +98,7 @@ void SimpleAdd3() { } auto expected = - CreateTensor({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24}); + net.CreateTensor({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-4, 1e-3); } @@ -136,8 +136,8 @@ void RandomTest() { // run on cpu net.RunOp(); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu for (int i = 0; i < input_num; ++i) { @@ -160,7 +160,7 @@ void RandomTest() { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); } } diff --git a/mace/ops/argmax.h b/mace/ops/argmax.h index ce493059387fc0d6aff802b7db053b9e47c8cfcb..b1d7ec4efc4d7d448eb6676d838730bfd5450386 100644 --- a/mace/ops/argmax.h +++ b/mace/ops/argmax.h @@ -26,8 +26,8 @@ namespace ops { template class ArgMaxOp : public Operator { public: - ArgMaxOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws) {} + ArgMaxOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), functor_(context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(0); diff --git a/mace/ops/argmax_test.cc b/mace/ops/argmax_test.cc index bf00b57933969f394c61593baa10518085b3c92a..ca7ece351801ef781edbd04826a2fb285ee1f77c 100644 --- a/mace/ops/argmax_test.cc +++ b/mace/ops/argmax_test.cc @@ -47,7 +47,7 @@ void ArgMaxTest(const std::vector &input_shape, } // Check - auto expected = CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h index 9d983f10a6105d144f0fd1366814072327765a64..7221c3ca1f10b535d1e570f4356e3720ac298a7d 100644 --- a/mace/ops/batch_norm.h +++ b/mace/ops/batch_norm.h @@ -25,9 +25,9 @@ namespace ops { template class BatchNormOp : public Operator { public: - BatchNormOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(false, kernels::ActivationType::NOOP, 0.0f) { + BatchNormOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, false, 
kernels::ActivationType::NOOP, 0.0f) { epsilon_ = OperatorBase::GetOptionalArg("epsilon", static_cast(1e-4)); } @@ -52,7 +52,8 @@ class BatchNormOp : public Operator { Tensor *output = this->Output(OUTPUT); MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - return functor_(input, scale, offset, mean, var, epsilon_, output, future); + return functor_(input, scale, offset, + mean, var, epsilon_, output, future); } private: diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index b72ec73ad0e5a39da814ce29bc407e82b7e6e41c..7d5b77daf1eb3cdb7a4402f83657421618ff2f44 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -79,7 +79,7 @@ void Simple() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708, 3.1708, 5.5125, 5.5125, 7.8543, 7.8543}); @@ -130,8 +130,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -166,7 +166,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-5, 1e-4); } TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { @@ -208,8 +209,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -245,7 +246,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-1, 1e-2); } TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { @@ -287,8 +289,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -323,7 +325,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-5, 1e-4); } TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { @@ -365,8 +368,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -402,7 +405,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-1, 1e-2); } } // namespace test diff --git a/mace/ops/batch_to_space.h b/mace/ops/batch_to_space.h index 
91c4a9ba8a929765d187c28777f02db06adc4e1b..fa1ed2c6a2534b62795cd3d2c541f722795ff9de 100644 --- a/mace/ops/batch_to_space.h +++ b/mace/ops/batch_to_space.h @@ -27,9 +27,10 @@ namespace ops { template class BatchToSpaceNDOp : public Operator { public: - BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetRepeatedArgs("crops", {0, 0, 0, 0}), + BatchToSpaceNDOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("crops", {0, 0, 0, 0}), OperatorBase::GetRepeatedArgs("block_shape", {1, 1}), true) {} diff --git a/mace/ops/bias_add.h b/mace/ops/bias_add.h index 901c1e74235b2b080730955082d68d0861c0b201..ee3de99116fea2a49153c2d1f79a73b570f8b02d 100644 --- a/mace/ops/bias_add.h +++ b/mace/ops/bias_add.h @@ -24,10 +24,11 @@ namespace ops { template class BiasAddOp : public Operator { public: - BiasAddOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(static_cast(OperatorBase::GetOptionalArg( - "data_format", NHWC))) {} + BiasAddOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + static_cast(OperatorBase::GetOptionalArg( + "data_format", NHWC))) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc index c41584542c01ab2ea5594cca1242a0bf2242c596..51c8cc8871f370f878025b919bde91d92d39fba1 100644 --- a/mace/ops/bias_add_test.cc +++ b/mace/ops/bias_add_test.cc @@ -66,7 +66,7 @@ void BiasAddSimple() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 6, 2, 1}, {5.5, 5.5, 7.5, 7.5, 9.5, 9.5, 11.5, 11.5, 13.5, 13.5, 15.5, 15.5}); @@ -111,8 +111,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -132,7 +132,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); } TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { @@ -167,8 +167,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -188,7 +188,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); } } // namespace test diff --git a/mace/ops/buffer_to_image.h b/mace/ops/buffer_to_image.h index 7c59c822d2e19f129248ddd76f3fd9bc69a5fe74..0fa34c30f52a339de00e5f1d5efd28fe844a433b 100644 --- a/mace/ops/buffer_to_image.h +++ b/mace/ops/buffer_to_image.h @@ -24,9 +24,10 @@ namespace ops { template class BufferToImageOp : public Operator { public: - BufferToImageOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - 
functor_(OperatorBase::GetOptionalArg("wino_block_size", 2)) {} + BufferToImageOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetOptionalArg("wino_block_size", 2)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input_tensor = this->Input(INPUT); diff --git a/mace/ops/cast.h b/mace/ops/cast.h index cee022ec4aedb9b848e9dc46b3e564e561c08b36..56d20d52cb97952476b46c993bba6024f59109c2 100644 --- a/mace/ops/cast.h +++ b/mace/ops/cast.h @@ -25,8 +25,8 @@ namespace ops { template class CastOp : public Operator { public: - CastOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + CastOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context) {} MaceStatus Run(StatsFuture *future) override { MACE_UNUSED(future); diff --git a/mace/ops/channel_shuffle.h b/mace/ops/channel_shuffle.h index bd9234c1abab8c3c6391f781a4b7177c1a82d5b1..a459a0b38e115ace4e4333ce5ca3dc5539f61afe 100644 --- a/mace/ops/channel_shuffle.h +++ b/mace/ops/channel_shuffle.h @@ -26,10 +26,10 @@ namespace ops { template class ChannelShuffleOp : public Operator { public: - ChannelShuffleOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), + ChannelShuffleOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), group_(OperatorBase::GetOptionalArg("group", 1)), - functor_(this->group_) {} + functor_(context, this->group_) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/channel_shuffle_test.cc b/mace/ops/channel_shuffle_test.cc index 0b674dab57b6a24feb81eed7bba64415a969ecb3..2102fe7652b2b552d8f9c8caeb09abfa786c1a57 100644 --- a/mace/ops/channel_shuffle_test.cc +++ b/mace/ops/channel_shuffle_test.cc @@ -45,7 +45,7 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) { NHWC); // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 1, 2, 8}, {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -77,7 +77,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { kernels::BufferType::IN_OUT_CHANNEL); // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 1, 2, 16}, {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31}); diff --git a/mace/ops/concat.h b/mace/ops/concat.h index be76371494a2116f180420ddadf75090bb103b54..94dee3d33dd8876183bb9934874b6f1cd4d2766f 100644 --- a/mace/ops/concat.h +++ b/mace/ops/concat.h @@ -26,9 +26,9 @@ namespace ops { template class ConcatOp : public Operator { public: - ConcatOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 3)) {} + ConcatOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, OperatorBase::GetOptionalArg("axis", 3)) {} MaceStatus Run(StatsFuture *future) override { MACE_CHECK(this->InputSize() >= 2) diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h index b15045cd18884e7112c19da7a6c5bdeab53560f0..5864e1edb0ad1bc4eed4c9db9d1411ea1a6499c2 100644 --- a/mace/ops/conv_2d.h +++ b/mace/ops/conv_2d.h @@ -28,9 +28,10 @@ namespace ops { template class Conv2dOp : public ConvPool2dOpBase { public: - Conv2dOp(const OperatorDef &op_def, Workspace *ws) - : ConvPool2dOpBase(op_def, ws), - functor_(this->strides_.data(), + Conv2dOp(const OperatorDef &op_def, 
OpKernelContext *context) + : ConvPool2dOpBase(op_def, context), + functor_(context, + this->strides_.data(), this->padding_type_, this->paddings_, this->dilations_.data(), @@ -40,7 +41,7 @@ class Conv2dOp : public ConvPool2dOpBase { OperatorBase::GetOptionalArg("max_limit", 0.0f), static_cast(OperatorBase::GetOptionalArg( "is_filter_transformed", false)), - ws->GetScratchBuffer(D)) {} + context->workspace()->GetScratchBuffer(D)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index ecfdafa2da6d257a0762dd7f06c71968c4348834..dd338e275356fd1bfab3cc21b50fade2b77da46e 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -84,7 +84,7 @@ void TestNHWCSimple3x3VALID() { MACE_NOT_IMPLEMENTED; } - auto expected = CreateTensor({1, 1, 1, 1}, {18.1f}); + auto expected = net.CreateTensor({1, 1, 1, 1}, {18.1f}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -147,7 +147,7 @@ void TestNHWCSimple3x3SAME() { MACE_NOT_IMPLEMENTED; } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 3, 3, 1}, {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); @@ -221,7 +221,7 @@ void TestNHWCSimple3x3WithoutBias() { } // Check - auto expected = CreateTensor({1, 1, 1, 1}, {18.0f}); + auto expected = net.CreateTensor({1, 1, 1, 1}, {18.0f}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -298,7 +298,7 @@ void TestNHWCCombined3x3() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f, 9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -374,7 +374,7 @@ void TestFusedNHWCSimple3x3VALID() { MACE_NOT_IMPLEMENTED; } - auto expected = CreateTensor({1, 1, 1, 1}, {0.0f}); + auto expected = net.CreateTensor({1, 1, 1, 1}, {0.0f}); ExpectTensorNear(*expected, *net.GetOutput("Output")); } template @@ -434,7 +434,7 @@ void TestFusedNHWCSimple3x3WithoutBias() { } // Check - auto expected = CreateTensor({1, 1, 1, 1}, {0.0f}); + auto expected = net.CreateTensor({1, 1, 1, 1}, {0.0f}); ExpectTensorNear(*expected, *net.GetOutput("Output")); } @@ -515,7 +515,7 @@ void TestConv1x1() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 3, 10, 2}, {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, @@ -576,8 +576,8 @@ void TestComplexConvNxNS12(const std::vector &shape, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -602,7 +602,7 @@ void TestComplexConvNxNS12(const std::vector &shape, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-4, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); }; @@ -685,8 +685,8 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -712,7 +712,7 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, 
ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-1); }; @@ -837,8 +837,8 @@ void TestDilationConvNxN(const std::vector &shape, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -863,7 +863,7 @@ void TestDilationConvNxN(const std::vector &shape, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-4, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); }; @@ -934,8 +934,8 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -960,7 +960,7 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-1); }; @@ -1021,8 +1021,8 @@ void TestArbitraryPadConvNxN(const std::vector &shape, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -1046,7 +1046,7 @@ void TestArbitraryPadConvNxN(const std::vector &shape, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-4, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); }; @@ -1104,7 +1104,7 @@ void TestQuantSimple3x3() { // Run net.Run(); // Check - auto expected = CreateTensor({1, 1, 1, 1}, {230}); + auto expected = net.CreateTensor({1, 1, 1, 1}, {230}); ExpectTensorNear(*expected, *output); } diff --git a/mace/ops/conv_pool_2d_base.h b/mace/ops/conv_pool_2d_base.h index 9c4860df735b1d59cc744ce18abb434f9a166c3b..0a8a8c174617dd0474ec4bdc8e82375c291f5f2a 100644 --- a/mace/ops/conv_pool_2d_base.h +++ b/mace/ops/conv_pool_2d_base.h @@ -26,8 +26,8 @@ namespace ops { template class ConvPool2dOpBase : public Operator { public: - ConvPool2dOpBase(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), + ConvPool2dOpBase(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), strides_(OperatorBase::GetRepeatedArgs("strides")), padding_type_(static_cast(OperatorBase::GetOptionalArg( "padding", static_cast(SAME)))), diff --git a/mace/ops/core_test.cc b/mace/ops/core_test.cc index ac184c80ba3295ccc4ab3f553940c0f485972030..8eecd77dca5d2ae149b3a3711b5c1d3cd631d6ad 100644 --- a/mace/ops/core_test.cc +++ b/mace/ops/core_test.cc @@ -21,6 +21,8 @@ namespace test { TEST(CoreTest, INIT_MODE) { std::vector op_defs; + Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU); + std::unique_ptr> tuner; Workspace ws; op_defs.emplace_back(OperatorDef()); @@ -31,7 +33,7 @@ TEST(CoreTest, INIT_MODE) { .AddIntArg("mode", static_cast(NetMode::INIT)) 
.Finalize(&op_defs[op_defs.size() - 1]); - Tensor *input = ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::GPU), + Tensor *input = ws.CreateTensor("Input", device->allocator(), DataTypeToEnum::v()); input->Resize({1, 3, 3, 3}); { @@ -53,13 +55,13 @@ TEST(CoreTest, INIT_MODE) { } std::shared_ptr op_registry(new OperatorRegistry()); auto net = - CreateNet(op_registry, net_def, &ws, DeviceType::GPU, NetMode::INIT); + CreateNet(op_registry, net_def, &ws, device, NetMode::INIT); net->Run(); EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr); EXPECT_TRUE(ws.GetTensor("Output") == nullptr); - net = CreateNet(op_registry, net_def, &ws, DeviceType::GPU); + net = CreateNet(op_registry, net_def, &ws, device); net->Run(); EXPECT_TRUE(ws.GetTensor("Output") != nullptr); diff --git a/mace/ops/crop.h b/mace/ops/crop.h index f1f179b9457786e083ac9e04a7ee5231b5cfba40..f50450693580a0d193cac1975e5903e1e624cfd5 100644 --- a/mace/ops/crop.h +++ b/mace/ops/crop.h @@ -26,9 +26,10 @@ namespace ops { template class CropOp : public Operator { public: - CropOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 2), + CropOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetOptionalArg("axis", 2), OperatorBase::GetRepeatedArgs("offset")) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/crop_test.cc b/mace/ops/crop_test.cc index a28205b9301cf5e8f543fee457bcedec45d515e6..b4bb7fddf1e1c630e5ad3897fcd7558fe5c5662c 100644 --- a/mace/ops/crop_test.cc +++ b/mace/ops/crop_test.cc @@ -75,7 +75,7 @@ void RunCrop(const std::vector &input_shape, "Output", NHWC); } // Check - auto expected = CreateTensor(expected_shape, expected_data); + auto expected = net.CreateTensor(expected_shape, expected_data); ExpectTensorNear(*expected, *net.GetOutput("Output")); } } // namespace diff --git a/mace/ops/deconv_2d.h b/mace/ops/deconv_2d.h index 188b8ba04438532d14acda575899ba29e2d16353..ec5b348e201f4048398ea2f3b8f69fca63c5337a 100644 --- a/mace/ops/deconv_2d.h +++ b/mace/ops/deconv_2d.h @@ -26,9 +26,10 @@ namespace ops { template class Deconv2dOp : public Operator { public: - Deconv2dOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetRepeatedArgs("strides"), + Deconv2dOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("strides"), static_cast(OperatorBase::GetOptionalArg( "padding", static_cast(SAME))), OperatorBase::GetRepeatedArgs("padding_values"), diff --git a/mace/ops/deconv_2d_test.cc b/mace/ops/deconv_2d_test.cc index 954d6bf41a69851ca2f872c684ee3e9a96b96610..67d0ac141e77218d2a047c4b472f1e13e661c8b8 100644 --- a/mace/ops/deconv_2d_test.cc +++ b/mace/ops/deconv_2d_test.cc @@ -79,7 +79,7 @@ void RunTestSimple(const std::vector &input_shape, "Output", NHWC); } - auto expected = CreateTensor(expected_shape, expected_data); + auto expected = net.CreateTensor(expected_shape, expected_data); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.0001); } @@ -350,8 +350,8 @@ void TestComplexDeconvNxNS12(const int batch, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -377,7 +377,7 @@ void TestComplexDeconvNxNS12(const int batch, ImageToBuffer(&net, "OutputImage", 
"OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-4, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); }; diff --git a/mace/ops/depth_to_space.h b/mace/ops/depth_to_space.h index 4be3f2a0dc08128eec9ca7141df414ab73c9bf81..49183873733cd4d878ad1113f64c76aa918744cd 100644 --- a/mace/ops/depth_to_space.h +++ b/mace/ops/depth_to_space.h @@ -27,10 +27,10 @@ namespace ops { template class DepthToSpaceOp : public Operator { public: - DepthToSpaceOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), + DepthToSpaceOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), block_size_(OperatorBase::GetOptionalArg("block_size", 1)), - functor_(this->block_size_, true) {} + functor_(context, this->block_size_, true) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/depth_to_space_test.cc b/mace/ops/depth_to_space_test.cc index e61590ff9b28ab73d1cb7559cb9dc3b3622fe842..99c4fb0b6e4bf05a4e2c502731136966cabdd07e 100644 --- a/mace/ops/depth_to_space_test.cc +++ b/mace/ops/depth_to_space_test.cc @@ -64,7 +64,7 @@ void RunDepthToSpace(const bool d2s, ImageToBuffer(&net, "OutputImage", "Output", kernels::BufferType::IN_OUT_CHANNEL); } - auto expected = CreateTensor(expected_shape, expected_data); + auto expected = net.CreateTensor(expected_shape, expected_data); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace diff --git a/mace/ops/depthwise_conv2d.h b/mace/ops/depthwise_conv2d.h index 2762aea5f48114413b90cc0250ab010de4486244..549af07a3977b65464288a96096b42cb22c2ac6d 100644 --- a/mace/ops/depthwise_conv2d.h +++ b/mace/ops/depthwise_conv2d.h @@ -29,9 +29,10 @@ namespace ops { template class DepthwiseConv2dOp : public ConvPool2dOpBase { public: - DepthwiseConv2dOp(const OperatorDef &op_def, Workspace *ws) - : ConvPool2dOpBase(op_def, ws), - functor_(this->strides_.data(), + DepthwiseConv2dOp(const OperatorDef &op_def, OpKernelContext *context) + : ConvPool2dOpBase(op_def, context), + functor_(context, + this->strides_.data(), this->padding_type_, this->paddings_, this->dilations_.data(), diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index a2d57911db1b7c136c87ec5c7b5ac6616f6ce289..6d6b84f1f79356d9f1eb6411fc564c912fca3e1d 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -80,7 +80,7 @@ void SimpleValidTest() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 2, 2, 2}, {37.1f, 148.2f, 47.1f, 188.2f, 67.1f, 268.2f, 77.1f, 308.2f}); @@ -212,7 +212,7 @@ void ComplexValidTest(index_t batch, } auto expected = - CreateTensor({1, out_height, out_width, out_channels}, expect); + net.CreateTensor({1, out_height, out_width, out_channels}, expect); if (DataTypeToEnum::value == DT_FLOAT) { ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -284,8 +284,8 @@ void TestNxNS12(const index_t height, const index_t width) { "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -312,10 +312,10 @@ void TestNxNS12(const index_t height, const index_t width) { // Check if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(expected, *net.GetOutput("DeviceOutput"), 1e-5, + 
ExpectTensorNear(*expected, *net.GetOutput("DeviceOutput"), 1e-5, 1e-4); } else { - ExpectTensorNear(expected, *net.GetOutput("DeviceOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("DeviceOutput"), 1e-2, 1e-2); } }; @@ -387,7 +387,7 @@ void QuantSimpleValidTest() { net.Run(); // Check - auto expected = CreateTensor({1, 1, 1, 2}, {255, 21}); + auto expected = net.CreateTensor({1, 1, 1, 2}, {255, 21}); ExpectTensorNear(*expected, *net.GetOutput("Output")); } diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h index 161d0e4fd9b5dba81d5d9d504ecd4a608edbedd4..f795256218eed2087d372e1acdbe5ba1db2fce96 100644 --- a/mace/ops/eltwise.h +++ b/mace/ops/eltwise.h @@ -24,9 +24,10 @@ namespace ops { template class EltwiseOp : public Operator { public: - EltwiseOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), + EltwiseOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), functor_( + context, static_cast(OperatorBase::GetOptionalArg( "type", static_cast(kernels::EltwiseType::NONE))), OperatorBase::GetRepeatedArgs("coeff"), diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index 55a0ce977563c16e8e3914a5c435fed7cedbbabc..76b04f3423a31fd344edb4cadee02857dcc4a71a 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -49,7 +49,7 @@ void SimpleScalarScalar(const kernels::EltwiseType type, MACE_NOT_IMPLEMENTED; } - auto expected = CreateTensor({}, {output}); + auto expected = net.CreateTensor({}, {output}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -97,7 +97,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type, kernels::BufferType::IN_OUT_CHANNEL); } - auto expected = CreateTensor(shape, output); + auto expected = net.CreateTensor(shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -167,7 +167,7 @@ void SimpleTensorEltwise(const kernels::EltwiseType type, if (input0.size() < input1.size()) { output_shape = shape1; } - auto expected = CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -206,7 +206,7 @@ void TensorGeneralBroadcastEltwise(const kernels::EltwiseType type, MACE_NOT_IMPLEMENTED; } - auto expected = CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace @@ -476,8 +476,8 @@ void RandomTensorScalar(const kernels::EltwiseType type, net.RunOp(DeviceType::CPU); net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImg", kernels::BufferType::IN_OUT_CHANNEL); @@ -496,9 +496,9 @@ void RandomTensorScalar(const kernels::EltwiseType type, kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(expected, *net.GetOutput("GPUOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-5); } else { - ExpectTensorNear(expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); } } @@ -531,8 +531,8 @@ void RandomTensorEltwise(const kernels::EltwiseType type, net.RunOp(DeviceType::CPU); net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = 
net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input0", "InputImg0", kernels::BufferType::IN_OUT_CHANNEL); @@ -554,9 +554,9 @@ void RandomTensorEltwise(const kernels::EltwiseType type, kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(expected, *net.GetOutput("GPUOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-5); } else { - ExpectTensorNear(expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); } } } // namespace diff --git a/mace/ops/fill.h b/mace/ops/fill.h index a8b55dbe8984f2d6f87e39e1d39373e9ad909b58..b6836d11978d7263439b03eda7b072feacf06c19 100644 --- a/mace/ops/fill.h +++ b/mace/ops/fill.h @@ -26,9 +26,9 @@ namespace ops { template class FillOp : public Operator { public: - FillOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_() {} + FillOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *shape = this->Input(SHAPE); diff --git a/mace/ops/folded_batch_norm.h b/mace/ops/folded_batch_norm.h index 9cd76c738b12282ec7cff8974cd48923405c4910..345d87b476ded184fa7b02ba8c47072589e41bc6 100644 --- a/mace/ops/folded_batch_norm.h +++ b/mace/ops/folded_batch_norm.h @@ -26,9 +26,10 @@ namespace ops { template class FoldedBatchNormOp : public Operator { public: - FoldedBatchNormOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(true, + FoldedBatchNormOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + true, kernels::StringToActivationType( OperatorBase::GetOptionalArg("activation", "NOOP")), diff --git a/mace/ops/folded_batch_norm_test.cc b/mace/ops/folded_batch_norm_test.cc index 3979583a1384dd962d850f912ee3546984c7cb76..16a6ad684809436832569a285158105e3b9137f2 100644 --- a/mace/ops/folded_batch_norm_test.cc +++ b/mace/ops/folded_batch_norm_test.cc @@ -83,7 +83,7 @@ void Simple() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708, 3.1708, 5.5125, 5.5125, 7.8543, 7.8543}); @@ -129,8 +129,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -153,7 +153,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-5, 1e-4); } TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { @@ -190,8 +191,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -215,7 +216,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, 
*net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-2, 1e-2); } TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { @@ -252,8 +254,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -275,7 +277,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-5, 1e-4); } TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { @@ -312,8 +315,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -336,7 +339,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-2, 1e-2); } } // namespace test diff --git a/mace/ops/fully_connected.h b/mace/ops/fully_connected.h index 8ec0039185366a5419cf2b56dfd9317b3a5342a3..313780cb3b9b39d568005ee84fa154390b13e827 100644 --- a/mace/ops/fully_connected.h +++ b/mace/ops/fully_connected.h @@ -26,9 +26,9 @@ namespace ops { template class FullyConnectedOp : public Operator { public: - FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(kernels::StringToActivationType( + FullyConnectedOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, kernels::StringToActivationType( OperatorBase::GetOptionalArg("activation", "NOOP")), OperatorBase::GetOptionalArg("max_limit", 0.0f)) {} @@ -61,7 +61,8 @@ class FullyConnectedOp : public Operator { " don't match."); } - return functor_(input, weight, bias, output, future); + return functor_(input, weight, + bias, output, future); } private: diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc index 8b30096da8475217c48f05b85dde702c3e754edc..cdeba2439a94e5987c9844c4482f57af78dbb14c 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -76,7 +76,7 @@ void Simple(const std::vector &input_shape, } // Check - auto expected = CreateTensor(output_shape, output_value); + auto expected = net.CreateTensor(output_shape, output_value); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -156,8 +156,8 @@ void Random(const index_t batch, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -181,10 +181,10 @@ void Random(const index_t batch, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-1, + 
ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-1); } else { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-3); } } diff --git a/mace/ops/gather.h b/mace/ops/gather.h index 37689b30a2765ee199db58f094a10f5513da8de6..fe4026d969835cc1dc456258194d40d7fb120584 100644 --- a/mace/ops/gather.h +++ b/mace/ops/gather.h @@ -24,9 +24,10 @@ namespace ops { template class GatherOp : public Operator { public: - GatherOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 0), + GatherOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetOptionalArg("axis", 0), OperatorBase::GetOptionalArg("y", 1.0)) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/gather_test.cc b/mace/ops/gather_test.cc index 3a35b3380ff8280f99b705fdf72b59d7a89ca77d..07a8438c515c88a9ae2631f79e52f27d45bfe237 100644 --- a/mace/ops/gather_test.cc +++ b/mace/ops/gather_test.cc @@ -47,7 +47,7 @@ void TestGather(const std::vector &weight_shape, // Run net.RunOp(CPU); - auto expected = CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } diff --git a/mace/ops/identity.h b/mace/ops/identity.h index 7140314cc25fe5ce809f577de9a4e4ed9bd8ec1c..be4d75bf48d2c92281fe70d4014fc5b0f5b063fa 100644 --- a/mace/ops/identity.h +++ b/mace/ops/identity.h @@ -25,8 +25,8 @@ namespace ops { template class IdentityOp : public Operator { public: - IdentityOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + IdentityOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/image_to_buffer.h b/mace/ops/image_to_buffer.h index c1b9b0b8a9d5af2b4ad79b7d6b2206db5b3677d8..fc259a01b9c2d7c5ac01cc05762bbe1d12abe2b5 100644 --- a/mace/ops/image_to_buffer.h +++ b/mace/ops/image_to_buffer.h @@ -24,9 +24,10 @@ namespace ops { template class ImageToBufferOp : public Operator { public: - ImageToBufferOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetOptionalArg("wino_block_size", 2)) {} + ImageToBufferOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetOptionalArg("wino_block_size", 2)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/infer_conv2d_shape.h b/mace/ops/infer_conv2d_shape.h index bc6163c170524800a5e0bbe5d83b7c419aeb123b..a39f66b6dec6109909384592e9db9bb4cab601c8 100644 --- a/mace/ops/infer_conv2d_shape.h +++ b/mace/ops/infer_conv2d_shape.h @@ -26,8 +26,8 @@ namespace ops { template class InferConv2dShapeOp : public Operator { public: - InferConv2dShapeOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + InferConv2dShapeOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/local_response_norm.h b/mace/ops/local_response_norm.h index d8ad1d3eac999e315b1d84899643952cbd9997a1..66265f19e0fcef441e7374072c17cdd525e47f71 100644 --- a/mace/ops/local_response_norm.h +++ 
b/mace/ops/local_response_norm.h @@ -24,8 +24,8 @@ namespace ops { template class LocalResponseNormOp : public Operator { public: - LocalResponseNormOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), functor_() { + LocalResponseNormOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), functor_(context) { depth_radius_ = OperatorBase::GetOptionalArg("depth_radius", 5); bias_ = OperatorBase::GetOptionalArg("bias", 1.0f); alpha_ = OperatorBase::GetOptionalArg("alpha", 1.0f); diff --git a/mace/ops/local_response_norm_test.cc b/mace/ops/local_response_norm_test.cc index dc12f28a32b157a89d45a4c91c22480664478917..6bb726ead5bf3f8fbe6173013d99557cbed03209 100644 --- a/mace/ops/local_response_norm_test.cc +++ b/mace/ops/local_response_norm_test.cc @@ -46,7 +46,7 @@ void Simple() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 1, 2, 6}, {0.28, 0.28, 0.39, 0.39, 0.51, 0.51, 0.34, 0.34, 0.40, 0.40, 0.47, 0.47}); diff --git a/mace/ops/lstmcell.h b/mace/ops/lstmcell.h index 300794f2341261a0ea13d1be0dffc48a3a6e1a78..3037c891ff5a9b7d9fb25096632556cce4193296 100644 --- a/mace/ops/lstmcell.h +++ b/mace/ops/lstmcell.h @@ -26,10 +26,12 @@ namespace ops { template class LSTMCellOp : public Operator { public: - LSTMCellOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(static_cast( - OperatorBase::GetOptionalArg("scalar_input", 0.0))) {} + LSTMCellOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + static_cast( + OperatorBase::GetOptionalArg("scalar_input", + 0.0))) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/matmul.h b/mace/ops/matmul.h index e5e0dafaafdf547817727dad8079373858406dc6..ceccb9398aaa7d5b730951672c0370e5509e1f7f 100644 --- a/mace/ops/matmul.h +++ b/mace/ops/matmul.h @@ -24,8 +24,9 @@ namespace ops { template class MatMulOp : public Operator { public: - MatMulOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), + MatMulOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context), transpose_a_(OperatorBase::GetOptionalArg("transpose_a", false)), transpose_b_(OperatorBase::GetOptionalArg("transpose_b", false)) { } @@ -46,7 +47,8 @@ class MatMulOp : public Operator { MACE_CHECK(ak == bk, "the number of A's column ", ak, " must be equal to B's row ", bk); - return functor_(A, B, C, transpose_a_, transpose_b_, future); + return functor_(A, B, C, + transpose_a_, transpose_b_, future); } private: diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc index 18a9ddc88e7b439b68404696f9082ca788eb68c6..9225b2269d5f37f36c412b26695ab07f36788b69 100644 --- a/mace/ops/matmul_test.cc +++ b/mace/ops/matmul_test.cc @@ -65,7 +65,7 @@ void Simple(const std::vector &A_shape, } // Check - auto expected = CreateTensor(C_shape, C_value); + auto expected = net.CreateTensor(C_shape, C_value); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -171,15 +171,15 @@ void Complex(const std::vector &batch, // Check EXPECT_EQ(expected_output_shape, net.GetOutput("Output")->shape()); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); - expected.Reshape({batch_count, height, out_width}); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); + expected->Reshape({batch_count, height, out_width}); if (DataTypeToEnum::value == 
DataType::DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-1); } else { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-5); } } diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..5be4cb9696978de91def270ae880df203849fcd4 --- /dev/null +++ b/mace/ops/ops_test_util.cc @@ -0,0 +1,44 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +OpTestContext *OpTestContext::Get() { + static OpTestContext instance; + return &instance; +} + +std::shared_ptr OpTestContext::gpu_context() const { + return gpu_context_; +} + +Device *OpTestContext::GetDevice(DeviceType device_type) { + return device_map_[device_type].get(); +} + +OpTestContext::OpTestContext() : gpu_context_(new GPUContext()) { + device_map_[DeviceType::CPU] = std::unique_ptr(new CPUDevice(-1)); + device_map_[DeviceType::GPU] = std::unique_ptr( + new GPUDevice(gpu_context_->opencl_tuner(), + gpu_context_->opencl_cache_storage(), + GPUPriorityHint::PRIORITY_NORMAL)); +} + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 2dc29241a73bfc69ee681d01bf15a6b8d928f0f3..278c3515f575c2c72eaa9f9a9908db491fc0c3cd 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -26,7 +27,8 @@ #include "gtest/gtest.h" #include "mace/core/net.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/core/device_context.h" +#include "mace/core/runtime/opencl/gpu_device.h" #include "mace/core/tensor.h" #include "mace/core/workspace.h" #include "mace/kernels/opencl/common.h" @@ -110,9 +112,28 @@ class OpDefBuilder { OperatorDef op_def_; }; +class OpTestContext { + public: + static OpTestContext *Get(); + std::shared_ptr gpu_context() const; + Device *GetDevice(DeviceType device_type); + private: + OpTestContext(); + MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext); + + std::shared_ptr gpu_context_; + std::map> device_map_; +}; + class OpsTestNet { public: - OpsTestNet() : op_registry_(new OperatorRegistry()) {} + OpsTestNet() : + op_registry_(new OperatorRegistry()) { + } + + ~OpsTestNet() { + Sync(); + } template void AddInputFromArray(const std::string &name, @@ -121,7 +142,8 @@ class OpsTestNet { const float scale = 0.0, const int32_t zero_point = 0) { Tensor *input = - ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum::v()); + ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); input->Resize(shape); Tensor::MappingGuard input_mapper(input); T *input_data = input->mutable_data(); @@ -136,7 +158,8 @@ class 
OpsTestNet { const std::vector &shape, const T data) { Tensor *input = - ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum::v()); + ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); input->Resize(shape); Tensor::MappingGuard input_mapper(input); T *input_data = input->mutable_data(); @@ -149,7 +172,8 @@ class OpsTestNet { bool positive = true, bool truncate = false) { Tensor *input = - ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum::v()); + ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); input->Resize(shape); Tensor::MappingGuard input_mapper(input); T *input_data = input->mutable_data(); @@ -184,8 +208,10 @@ class OpsTestNet { template void Transpose2D(const std::string &src_name, const std::string &dst_name) { Tensor *input = ws_.GetTensor(src_name); - Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D), - DataTypeToEnum::v()); + Tensor *output = ws_.CreateTensor( + dst_name, + OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); const std::vector input_shape = input->shape(); MACE_CHECK(input_shape.size() == 2, "input shape != 2"); output->Resize({input_shape[1], input_shape[0]}); @@ -205,8 +231,10 @@ class OpsTestNet { void CopyData(const std::string &src_name, const std::string &dst_name) { Tensor *input = ws_.GetTensor(src_name); - Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D), - DataTypeToEnum::v()); + Tensor *output = ws_.CreateTensor( + dst_name, + OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); const std::vector input_shape = input->shape(); output->Resize(input_shape); @@ -222,8 +250,10 @@ class OpsTestNet { const std::string &dst_name, const DataFormat dst_format) { Tensor *input = ws_.GetTensor(src_name); - Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D), - DataTypeToEnum::v()); + Tensor *output = ws_.CreateTensor( + dst_name, + OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); const std::vector input_shape = input->shape(); MACE_CHECK(input_shape.size() == 4, "input shape != 4"); @@ -352,8 +382,10 @@ class OpsTestNet { void FillNHWCInputToNCHWInput(const std::string &name_nchw, const std::string &name_nhwc) { Tensor *input = ws_.GetTensor(name_nhwc); - Tensor *output = ws_.CreateTensor(name_nchw, GetDeviceAllocator(D), - DataTypeToEnum::v()); + Tensor *output = ws_.CreateTensor( + name_nchw, + OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); const std::vector input_shape = input->shape(); index_t batch = input_shape[0]; index_t height = input_shape[1]; @@ -374,6 +406,22 @@ class OpsTestNet { } } + // Create standalone tensor on device D with T type. 
+ template + std::unique_ptr CreateTensor( + const std::vector &shape = {}, + const std::vector &data = {}) { + std::unique_ptr res( + new Tensor(OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v())); + if (!data.empty()) { + res->Resize(shape); + T *input_data = res->mutable_data(); + memcpy(input_data, data.data(), data.size() * sizeof(T)); + } + return res; + } + OperatorDef *NewOperatorDef() { op_defs_.clear(); op_defs_.emplace_back(OperatorDef()); @@ -392,8 +440,9 @@ class OpsTestNet { for (auto &op_def_ : op_defs_) { net_def.add_op()->CopyFrom(op_def_); } - net_ = CreateNet(op_registry_, net_def, &ws_, device); - device_ = device; + net_ = CreateNet(op_registry_, net_def, &ws_, + OpTestContext::Get()->GetDevice(device)); + device_type_ = device; return net_ != nullptr; } @@ -416,10 +465,15 @@ class OpsTestNet { MaceStatus RunOp() { return RunOp(DeviceType::CPU); } MaceStatus RunNet(const NetDef &net_def, const DeviceType device) { - device_ = device; - net_ = CreateNet(op_registry_, net_def, &ws_, device, NetMode::INIT); + device_type_ = device; + net_ = CreateNet(op_registry_, + net_def, + &ws_, + OpTestContext::Get()->GetDevice(device), + NetMode::INIT); MACE_RETURN_IF_ERROR(net_->Run()); - net_ = CreateNet(op_registry_, net_def, &ws_, device); + net_ = CreateNet(op_registry_, net_def, &ws_, + OpTestContext::Get()->GetDevice(device)); return net_->Run(); } @@ -432,9 +486,12 @@ class OpsTestNet { } void Sync() { - if (net_ && device_ == DeviceType::GPU) { - OpenCLRuntime::Global()->command_queue().finish(); +#ifdef MACE_ENABLE_OPENCL + if (net_ && device_type_ == DeviceType::GPU) { + OpTestContext::Get()->GetDevice(DeviceType::GPU)->opencl_runtime() + ->command_queue().finish(); } +#endif } public: @@ -442,17 +499,17 @@ class OpsTestNet { Workspace ws_; std::vector op_defs_; std::unique_ptr net_; - DeviceType device_; + DeviceType device_type_; }; class OpsTestBase : public ::testing::Test { protected: virtual void SetUp() { - // OpenCLRuntime::CreateGlobal(); + SetOpenMPThreadsAndAffinityPolicy(-1, + CPUAffinityPolicy::AFFINITY_BIG_ONLY); } virtual void TearDown() { - // OpenCLRuntime::DestroyGlobal(); } }; @@ -510,17 +567,6 @@ std::vector VectorStaticCast(const std::vector &&src) { return std::move(dest); } -template -std::unique_ptr CreateTensor(const std::vector &shape, - const std::vector &data) { - std::unique_ptr res( - new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum::v())); - res->Resize(shape); - T *input_data = res->mutable_data(); - memcpy(input_data, data.data(), data.size() * sizeof(T)); - return res; -} - inline bool IsSameSize(const Tensor &x, const Tensor &y) { if (x.dim_size() != y.dim_size()) return false; for (int d = 0; d < x.dim_size(); ++d) { diff --git a/mace/ops/pad.h b/mace/ops/pad.h index 9867710917fd64983fbb8c006bda092baa0b04b0..6a7ce1027946497cb287618a9320b33887aafcdd 100644 --- a/mace/ops/pad.h +++ b/mace/ops/pad.h @@ -26,9 +26,10 @@ namespace ops { template class PadOp : public Operator { public: - PadOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetRepeatedArgs("paddings"), + PadOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("paddings"), OperatorBase::GetOptionalArg("constant_value", 0.0)) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/pad_test.cc b/mace/ops/pad_test.cc index 
2f4a97214a04ab1df8f78005cf2da6f82c819643..3a68248eb5dfc157b3c3111e910b2928fb9b6369 100644 --- a/mace/ops/pad_test.cc +++ b/mace/ops/pad_test.cc @@ -63,7 +63,7 @@ void Simple() { auto output = net.GetTensor("Output"); - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 5, 6, 1}, { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2, 2, 2, 1.0, 1.0, 1.0, 2, 2, 2, 1.0, 1.0, 1.0, 1.0, @@ -99,7 +99,7 @@ TEST_F(PadTest, ComplexCPU) { auto output = net.GetTensor("Output"); - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 3, 3, 4}, { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, @@ -134,8 +134,8 @@ void Complex(const std::vector &input_shape, net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -155,9 +155,9 @@ void Complex(const std::vector &input_shape, auto output = net.GetTensor("OpenCLOutput"); if (DataTypeToEnum::value == DT_HALF) { - ExpectTensorNear(expected, *output, 1e-2, 1e-2); + ExpectTensorNear(*expected, *output, 1e-2, 1e-2); } else { - ExpectTensorNear(expected, *output, 1e-5); + ExpectTensorNear(*expected, *output, 1e-5); } } } // namespace diff --git a/mace/ops/pooling.h b/mace/ops/pooling.h index fac4e1dd53b62c811aa40f2b7dfe7b96c1610213..3d1753b399489766da17a2245ef2dc4f92f8683d 100644 --- a/mace/ops/pooling.h +++ b/mace/ops/pooling.h @@ -27,13 +27,14 @@ namespace ops { template class PoolingOp : public ConvPool2dOpBase { public: - PoolingOp(const OperatorDef &op_def, Workspace *ws) - : ConvPool2dOpBase(op_def, ws), + PoolingOp(const OperatorDef &op_def, OpKernelContext *context) + : ConvPool2dOpBase(op_def, context), kernels_(OperatorBase::GetRepeatedArgs("kernels")), pooling_type_( static_cast(OperatorBase::GetOptionalArg( "pooling_type", static_cast(AVG)))), - functor_(pooling_type_, + functor_(context, + pooling_type_, kernels_.data(), this->strides_.data(), this->padding_type_, diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index 72a4fdeef86077ff8633a98a14bca24642cfed0e..2f02d729ed45aa6a160af5d42d09bcc915650481 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -57,7 +57,7 @@ TEST_F(PoolingOpTest, MAX_VALID) { // Check auto expected = - CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); + net.CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -90,7 +90,7 @@ TEST_F(PoolingOpTest, MAX_SAME) { NHWC); // Check - auto expected = CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); + auto expected = net.CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -124,7 +124,7 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { NHWC); // Check - auto expected = CreateTensor({1, 2, 2, 1}, {10, 11, 14, 15}); + auto expected = net.CreateTensor({1, 2, 2, 1}, {10, 11, 14, 15}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -158,7 +158,7 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { NHWC); // Check - auto expected = CreateTensor({1, 1, 5, 1}, {10, 12, 14, 16, 17}); + auto expected = net.CreateTensor({1, 1, 5, 1}, {10, 12, 14, 16, 17}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -209,7 +209,7 @@ void SimpleMaxPooling3S2() { } // Check - auto expected = CreateTensor({1, 1, 4, 1}, {20, 22, 24, 26}); + auto expected = 
net.CreateTensor({1, 1, 4, 1}, {20, 22, 24, 26}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -249,8 +249,8 @@ void MaxPooling3S2(const std::vector &input_shape, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -269,10 +269,10 @@ void MaxPooling3S2(const std::vector &input_shape, kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-3, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-3, 1e-4); } else { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); } } } // namespace @@ -334,7 +334,7 @@ TEST_F(PoolingOpTest, AVG_VALID) { NHWC); // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -368,7 +368,7 @@ void SimpleAvgPoolingTest() { kernels::BufferType::IN_OUT_CHANNEL); // Check - auto expected = CreateTensor({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5}); + auto expected = net.CreateTensor({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -407,8 +407,8 @@ void AvgPoolingTest(const std::vector &shape, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -427,10 +427,10 @@ void AvgPoolingTest(const std::vector &shape, kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-3, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-3, 1e-3); } else { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); } } } // namespace @@ -503,7 +503,7 @@ TEST_F(PoolingOpTest, QUANT_MAX_VALID) { // Check auto expected = - CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); + net.CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -531,7 +531,7 @@ TEST_F(PoolingOpTest, QUANT_MAX_SAME) { net.RunOp(); // Check - auto expected = CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); + auto expected = net.CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -561,7 +561,7 @@ TEST_F(PoolingOpTest, QUANT_AVG_VALID) { net.RunOp(); // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 2, 2, 2}, {3, 19, 5, 21, 11, 27, 13, 29}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); diff --git a/mace/ops/proposal.h b/mace/ops/proposal.h index 1afabb8fe36800a4e09af30d0e14dd9586256376..d879e240ca200d5fbd09212a7e0ecde68314c47e 100644 --- a/mace/ops/proposal.h +++ b/mace/ops/proposal.h @@ -24,9 +24,10 @@ namespace ops { template class ProposalOp : public Operator { public: - ProposalOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetOptionalArg("min_size", 16), + 
ProposalOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetOptionalArg("min_size", 16), OperatorBase::GetOptionalArg("nms_thresh", 0.7), OperatorBase::GetOptionalArg("pre_nms_top_n", 6000), OperatorBase::GetOptionalArg("post_nms_top_n", 300), diff --git a/mace/ops/proposal_test.cc b/mace/ops/proposal_test.cc index c5b71ad24d892fefb09eb3999b1d92d27113ff62..e8b2ae5aad79dbab8f08e89006a7e38ff40360d0 100644 --- a/mace/ops/proposal_test.cc +++ b/mace/ops/proposal_test.cc @@ -60,7 +60,8 @@ TEST_F(ProposalOpTest, CPUSimple) { // Run net.RunOp(); - auto expected_tensor = CreateTensor({1, 1, 1, 5}, {0, 0, 0, 255, 255}); + auto expected_tensor = net.CreateTensor({1, 1, 1, 5}, + {0, 0, 0, 255, 255}); ExpectTensorNear(*expected_tensor, *net.GetTensor("Output"), 1e-5); } diff --git a/mace/ops/quantize.h b/mace/ops/quantize.h index eb78489bb53a70c9321bcf37bd1abb6c8543b5ac..2e7a77c2c624e5cc551898bc0b6d971eba580b1a 100644 --- a/mace/ops/quantize.h +++ b/mace/ops/quantize.h @@ -24,8 +24,9 @@ namespace ops { template class QuantizeOp : public Operator { public: - QuantizeOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), + QuantizeOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context), non_zero_( static_cast(OperatorBase::GetOptionalArg("non_zero", 0))) {} @@ -50,8 +51,8 @@ class QuantizeOp : public Operator { template class DequantizeOp : public Operator { public: - DequantizeOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws) {} + DequantizeOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), functor_(context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/reduce_mean.h b/mace/ops/reduce_mean.h index 7cdaff86fbd6714417678c27d36357a3d9cde4e3..0ef9c10274abbb28b6fb86bba2591e28ab0e38d2 100644 --- a/mace/ops/reduce_mean.h +++ b/mace/ops/reduce_mean.h @@ -27,9 +27,10 @@ namespace ops { template class ReduceMeanOp : public Operator { public: - ReduceMeanOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetRepeatedArgs("axis"), + ReduceMeanOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("axis"), OperatorBase::GetOptionalArg("keepdims", false)) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/reduce_mean_test.cc b/mace/ops/reduce_mean_test.cc index 4f5a029e836aee7671256c790850f7c6044e11a1..2b1875ded8fb030234f818b0061067099d8ed467 100644 --- a/mace/ops/reduce_mean_test.cc +++ b/mace/ops/reduce_mean_test.cc @@ -57,7 +57,7 @@ void Simple(const std::vector &input_shape, ImageToBuffer(&net, "OutputImg", "Output", kernels::BufferType::IN_OUT_CHANNEL); } - auto expected = CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-3); } diff --git a/mace/ops/reshape.h b/mace/ops/reshape.h index c47e6cb1791e2fbd3e1fa1aa0506d9189f6dd0f1..86476de06bb5cb65e55bc623218fb7f97f1e3819 100644 --- a/mace/ops/reshape.h +++ b/mace/ops/reshape.h @@ -26,8 +26,8 @@ namespace ops { template class ReshapeOp : public Operator { public: - ReshapeOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + ReshapeOp(const OperatorDef 
&op_def, OpKernelContext *context) + : Operator(op_def, context), functor_(context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/resize_bicubic.h b/mace/ops/resize_bicubic.h index a83f3a310afc02ca3abd474b4481e16470f28953..23b4c116b660ae814e9c8085a7cbf90712861c02 100644 --- a/mace/ops/resize_bicubic.h +++ b/mace/ops/resize_bicubic.h @@ -24,9 +24,10 @@ namespace ops { template class ResizeBicubicOp : public Operator { public: - ResizeBicubicOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetRepeatedArgs("size", {-1, -1}), + ResizeBicubicOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("size", {-1, -1}), OperatorBase::GetOptionalArg("align_corners", false)) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/resize_bicubic_test.cc b/mace/ops/resize_bicubic_test.cc index 7c7bd8bc263dd579fc3576a278550a894f97a7d3..97da04804395fdbe13e1fef70ca619ce4f06c771 100644 --- a/mace/ops/resize_bicubic_test.cc +++ b/mace/ops/resize_bicubic_test.cc @@ -48,7 +48,7 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) { NHWC); // Check - auto expected = CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); + auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2); } @@ -77,7 +77,7 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) { NHWC); // Check - auto expected = CreateTensor({1, 2, 3, 3}, + auto expected = net.CreateTensor({1, 2, 3, 3}, {0., 1., 2., 4.110297, 5.110297, 6.110297, 8.223037, 9.223036, 10.223037, 24., 25., 26., 28.110298, 29.1103, 30.110298, 32.223038, 33.223038, 34.223038}); @@ -110,7 +110,7 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) { NHWC); // Check - auto expected = CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); + auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2); } diff --git a/mace/ops/resize_bilinear.h b/mace/ops/resize_bilinear.h index fb3898599ef706c6dc160158a074cb2ff663d986..f328a9a45e152b162ea0b7e978d078b0d5dbac29 100644 --- a/mace/ops/resize_bilinear.h +++ b/mace/ops/resize_bilinear.h @@ -24,9 +24,10 @@ namespace ops { template class ResizeBilinearOp : public Operator { public: - ResizeBilinearOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetRepeatedArgs("size", {-1, -1}), + ResizeBilinearOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("size", {-1, -1}), OperatorBase::GetOptionalArg("align_corners", false)) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc index 49dda888ee3ebceab58ec9f82830cef05d8d3ebe..5d284f867a88c4acedbeb9293372dba7b9e1ea9d 100644 --- a/mace/ops/resize_bilinear_test.cc +++ b/mace/ops/resize_bilinear_test.cc @@ -48,7 +48,7 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) { NHWC); // Check - auto expected = CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); + auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -78,7 +78,7 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { NHWC); // Check - auto 
expected = CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); + auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -116,8 +116,8 @@ void TestRandomResizeBilinear() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", @@ -136,7 +136,7 @@ void TestRandomResizeBilinear() { kernels::BufferType::IN_OUT_CHANNEL); } // Check - ExpectTensorNear(expected, *net.GetOutput("DeviceOutput"), 1e-5, + ExpectTensorNear(*expected, *net.GetOutput("DeviceOutput"), 1e-5, 1e-6); } } diff --git a/mace/ops/scalar_math.h b/mace/ops/scalar_math.h index 29cb478c718f0d7eef1a8c1e18c61550ca9f2cee..356c93719894353a35459371b9f04d5f821a540a 100644 --- a/mace/ops/scalar_math.h +++ b/mace/ops/scalar_math.h @@ -26,9 +26,10 @@ namespace ops { template class ScalarMathOp : public Operator { public: - ScalarMathOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(static_cast( + ScalarMathOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + static_cast( OperatorBase::GetOptionalArg( "type", static_cast(kernels::EltwiseType::NONE))), OperatorBase::GetRepeatedArgs("coeff"), diff --git a/mace/ops/scalar_math_test.cc b/mace/ops/scalar_math_test.cc index 32b9db0001f4c9edb5639e90683bb5ac49a3449d..0d34b80abb16cf4e7f6126f2d74e9c5ce8770fe0 100644 --- a/mace/ops/scalar_math_test.cc +++ b/mace/ops/scalar_math_test.cc @@ -49,60 +49,60 @@ void ScalarMathTest(const kernels::EltwiseType type, net.RunOp(D); - auto expected = CreateTensor({}, {output}); + auto expected = net.CreateTensor({}, {output}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace TEST_F(ScalarMathOpTest, SimpleCPU) { -ScalarMathTest( - kernels::EltwiseType::SUM, 1, 2, 3, 3); -ScalarMathTest( - kernels::EltwiseType::SUB, 1, 2, 3, -1); -ScalarMathTest( - kernels::EltwiseType::PROD, 3, -2, 3, -6); -ScalarMathTest( - kernels::EltwiseType::DIV, 3, -2, 1, -1.5); -ScalarMathTest( - kernels::EltwiseType::MIN, 3, -2, 1, -2); -ScalarMathTest( - kernels::EltwiseType::MAX, 3, -2, 1, 3); -ScalarMathTest( - kernels::EltwiseType::NEG, 3, -2, 1, -3); -ScalarMathTest( - kernels::EltwiseType::ABS, 3, -2, 1, 3); -ScalarMathTest( - kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25); -ScalarMathTest( - kernels::EltwiseType::POW, 3, 1, 1, 3); -ScalarMathTest( - kernels::EltwiseType::EQUAL, 3, 3, 1, 1); + ScalarMathTest( + kernels::EltwiseType::SUM, 1, 2, 3, 3); + ScalarMathTest( + kernels::EltwiseType::SUB, 1, 2, 3, -1); + ScalarMathTest( + kernels::EltwiseType::PROD, 3, -2, 3, -6); + ScalarMathTest( + kernels::EltwiseType::DIV, 3, -2, 1, -1.5); + ScalarMathTest( + kernels::EltwiseType::MIN, 3, -2, 1, -2); + ScalarMathTest( + kernels::EltwiseType::MAX, 3, -2, 1, 3); + ScalarMathTest( + kernels::EltwiseType::NEG, 3, -2, 1, -3); + ScalarMathTest( + kernels::EltwiseType::ABS, 3, -2, 1, 3); + ScalarMathTest( + kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25); + ScalarMathTest( + kernels::EltwiseType::POW, 3, 1, 1, 3); + ScalarMathTest( + kernels::EltwiseType::EQUAL, 3, 3, 1, 1); } TEST_F(ScalarMathOpTest, SimpleGPU) { -ScalarMathTest( - kernels::EltwiseType::SUM, 1, 2, 1, 3); -ScalarMathTest( - kernels::EltwiseType::SUB, 1, 2, 1, -1); -ScalarMathTest( - kernels::EltwiseType::PROD, 3, -2, 1, 
-6); -ScalarMathTest( - kernels::EltwiseType::DIV, 3, -2, 1, -1.5); -ScalarMathTest( - kernels::EltwiseType::MIN, 3, -2, 1, -2); -ScalarMathTest( - kernels::EltwiseType::MAX, 3, -2, 1, 3); -ScalarMathTest( - kernels::EltwiseType::NEG, 3, -2, 1, -3); -ScalarMathTest( - kernels::EltwiseType::ABS, 3, -2, 1, 3); -ScalarMathTest( - kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25); -ScalarMathTest( - kernels::EltwiseType::POW, 3, 1, 1, 3); -ScalarMathTest( - kernels::EltwiseType::EQUAL, 3, 3, 1, 1); + ScalarMathTest( + kernels::EltwiseType::SUM, 1, 2, 1, 3); + ScalarMathTest( + kernels::EltwiseType::SUB, 1, 2, 1, -1); + ScalarMathTest( + kernels::EltwiseType::PROD, 3, -2, 1, -6); + ScalarMathTest( + kernels::EltwiseType::DIV, 3, -2, 1, -1.5); + ScalarMathTest( + kernels::EltwiseType::MIN, 3, -2, 1, -2); + ScalarMathTest( + kernels::EltwiseType::MAX, 3, -2, 1, 3); + ScalarMathTest( + kernels::EltwiseType::NEG, 3, -2, 1, -3); + ScalarMathTest( + kernels::EltwiseType::ABS, 3, -2, 1, 3); + ScalarMathTest( + kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25); + ScalarMathTest( + kernels::EltwiseType::POW, 3, 1, 1, 3); + ScalarMathTest( + kernels::EltwiseType::EQUAL, 3, 3, 1, 1); } } // namespace test } // namespace ops diff --git a/mace/ops/shape.h b/mace/ops/shape.h index 98f139e44877756875cd8f0d7ee6335b35ae75bc..abb9ffb3197bf53c46881e53bc01c3f4c072bae3 100644 --- a/mace/ops/shape.h +++ b/mace/ops/shape.h @@ -25,8 +25,8 @@ namespace ops { template class ShapeOp : public Operator { public: - ShapeOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + ShapeOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/softmax.h b/mace/ops/softmax.h index 0a6868f05e2393aca3fccfe0fd535964c079c194..047402f0c0c5bf45f25ff58405359013e6ce0fa4 100644 --- a/mace/ops/softmax.h +++ b/mace/ops/softmax.h @@ -24,8 +24,9 @@ namespace ops { template class SoftmaxOp : public Operator { public: - SoftmaxOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws) {} + SoftmaxOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *logits = this->Input(LOGITS); diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc index 827067f4ce093b42539cc388fefb13ffa691b905..012424c5b5d3deeed00fc73beb05b02063cd3374 100644 --- a/mace/ops/softmax_test.cc +++ b/mace/ops/softmax_test.cc @@ -29,7 +29,7 @@ void Simple() { // Add input data net.AddInputFromArray("Input", {1, 1, 2, 4}, {1, 1, 1, 1, 1, 2, 3, 4}); - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 1, 2, 4}, {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426}); @@ -113,8 +113,8 @@ void Complex(const std::vector &logits_shape) { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -131,7 +131,7 @@ void Complex(const std::vector &logits_shape) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); } } // namespace diff --git 
a/mace/ops/space_to_batch.h b/mace/ops/space_to_batch.h index 7ce0dd135fcd8c43844db52740e993ba8aafd6ab..170bde09b0876edb370f7873f3f9fa09e55d67ce 100644 --- a/mace/ops/space_to_batch.h +++ b/mace/ops/space_to_batch.h @@ -27,9 +27,10 @@ namespace ops { template class SpaceToBatchNDOp : public Operator { public: - SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetRepeatedArgs("paddings", {0, 0, 0, 0}), + SpaceToBatchNDOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("paddings", {0, 0, 0, 0}), OperatorBase::GetRepeatedArgs("block_shape", {1, 1}), false) {} diff --git a/mace/ops/space_to_batch_test.cc b/mace/ops/space_to_batch_test.cc index 5539bfd628a5f15c1a8511b47ebf3d8f5ff322af..8a3c35feff500ccf180b23de814ca5c89569c74b 100644 --- a/mace/ops/space_to_batch_test.cc +++ b/mace/ops/space_to_batch_test.cc @@ -116,24 +116,23 @@ void TestBidirectionalTransform(const std::vector &space_shape, const std::vector &padding_data, const std::vector &batch_shape, const std::vector &batch_data) { - auto space_tensor = std::unique_ptr( - new Tensor(GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum::v())); + OpsTestNet net; + auto space_tensor = net.CreateTensor(); space_tensor->Resize(space_shape); { Tensor::MappingGuard space_mapper(space_tensor.get()); - T *space_ptr = space_tensor->mutable_data(); + T *space_ptr = space_tensor->template mutable_data(); MACE_CHECK(static_cast(space_tensor->size()) == space_data.size()) << "Space tensor size:" << space_tensor->size() << ", space data size:" << space_data.size(); memcpy(space_ptr, space_data.data(), space_data.size() * sizeof(T)); } - auto batch_tensor = std::unique_ptr( - new Tensor(GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum::v())); + auto batch_tensor = net.CreateTensor(); batch_tensor->Resize(batch_shape); { Tensor::MappingGuard batch_mapper(batch_tensor.get()); - T *batch_ptr = batch_tensor->mutable_data(); + T *batch_ptr = batch_tensor->template mutable_data(); MACE_CHECK(static_cast(batch_tensor->size()) == batch_data.size()); memcpy(batch_ptr, batch_data.data(), batch_data.size() * sizeof(T)); } diff --git a/mace/ops/space_to_depth.h b/mace/ops/space_to_depth.h index 44ca7e5c5b72a69cb8bab5f1f665b71bd64ede35..75dd27ed04a4a49a85a7e6c8d760bc0a76c1928b 100644 --- a/mace/ops/space_to_depth.h +++ b/mace/ops/space_to_depth.h @@ -27,9 +27,11 @@ namespace ops { template class SpaceToDepthOp : public Operator { public: - SpaceToDepthOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetOptionalArg("block_size", 1), false) {} + SpaceToDepthOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetOptionalArg("block_size", 1), + false) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/split.h b/mace/ops/split.h index 710cdfb343de578c59830022b5e702e5ee99dd18..aa41aa15c6bb6a2f181d514b916859c252aeffb1 100644 --- a/mace/ops/split.h +++ b/mace/ops/split.h @@ -26,9 +26,9 @@ namespace ops { template class SplitOp : public Operator { public: - SplitOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 3)) {} + SplitOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, OperatorBase::GetOptionalArg("axis", 3)) {} 
MaceStatus Run(StatsFuture *future) override { MACE_CHECK(this->OutputSize() >= 2) diff --git a/mace/ops/squeeze.h b/mace/ops/squeeze.h index 35b2aed4c2585f5bc85c427962270d9e35baf973..7febfb0e20b377c54493623910c64f18228da487 100644 --- a/mace/ops/squeeze.h +++ b/mace/ops/squeeze.h @@ -26,8 +26,8 @@ namespace ops { template class SqueezeOp : public Operator { public: - SqueezeOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), + SqueezeOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), axis_(OperatorBase::GetRepeatedArgs("axis", {})) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/stack.h b/mace/ops/stack.h index 17210fb29259cfbdf52b91840424863c0c3c62c4..be25c0b079cf014eb171c2b4f311e038ac256892 100644 --- a/mace/ops/stack.h +++ b/mace/ops/stack.h @@ -26,9 +26,9 @@ namespace ops { template class StackOp : public Operator { public: - StackOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 0)) {} + StackOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, OperatorBase::GetOptionalArg("axis", 0)) {} MaceStatus Run(StatsFuture *future) override { const std::vector &inputs = this->Inputs(); diff --git a/mace/ops/strided_slice.h b/mace/ops/strided_slice.h index 57653359c2b0d4333ed8e04517c699e60b7439b3..249dc3e9d07b7b59665faedc10cb7c320f1c9aea 100644 --- a/mace/ops/strided_slice.h +++ b/mace/ops/strided_slice.h @@ -24,9 +24,10 @@ namespace ops { template class StridedSliceOp : public Operator { public: - StridedSliceOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetOptionalArg("begin_mask", 0), + StridedSliceOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetOptionalArg("begin_mask", 0), OperatorBase::GetOptionalArg("end_mask", 0), OperatorBase::GetOptionalArg("ellipsis_mask", 0), OperatorBase::GetOptionalArg("new_axis_mask", 0), diff --git a/mace/ops/transpose.h b/mace/ops/transpose.h index 1ad73db91ede576ddde3406648d41b61fd630e4b..91aa3365a3606b3f8899e4ca07141fba7011fc7d 100644 --- a/mace/ops/transpose.h +++ b/mace/ops/transpose.h @@ -26,10 +26,10 @@ namespace mace { template class TransposeOp : public Operator { public: - TransposeOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), + TransposeOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), dims_(OperatorBase::GetRepeatedArgs("dims")), - functor_(dims_) {} + functor_(context, dims_) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/unstack.h b/mace/ops/unstack.h index 1f743bd5974e99f758b8922506f0588c81b419ff..1c3d1764972f6f8dc40e7353a2445e1e0ee6421d 100644 --- a/mace/ops/unstack.h +++ b/mace/ops/unstack.h @@ -26,9 +26,9 @@ namespace ops { template class UnstackOp : public Operator { public: - UnstackOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 0)) {} + UnstackOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, OperatorBase::GetOptionalArg("axis", 0)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git 
a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc index 2406a3614a3acb49788c2bc2ac72338e068b0a1a..3cd5ab92b7a5aa0def56ed83bb58847042b2fc20 100644 --- a/mace/ops/winograd_convolution_test.cc +++ b/mace/ops/winograd_convolution_test.cc @@ -64,9 +64,10 @@ void WinogradConvolution(const index_t batch, // Transfer output ImageToBuffer(&net, "OutputImage", "ConvOutput", kernels::BufferType::IN_OUT_CHANNEL); - Tensor expected; - expected.Copy(*net.GetOutput("ConvOutput")); - auto output_shape = expected.shape(); + + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("ConvOutput")); + auto output_shape = expected->shape(); // Winograd convolution // transform filter @@ -124,9 +125,11 @@ void WinogradConvolution(const index_t batch, ImageToBuffer(&net, "WinoOutputImage", "WinoOutput", kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("WinoOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), + 1e-2, 1e-2); } else { - ExpectTensorNear(expected, *net.GetOutput("WinoOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), + 1e-5, 1e-4); } } } // namespace @@ -212,9 +215,9 @@ void WinogradConvolutionWithPad(const index_t batch, // Transfer output ImageToBuffer(&net, "OutputImage", "ConvOutput", kernels::BufferType::IN_OUT_CHANNEL); - Tensor expected; - expected.Copy(*net.GetOutput("ConvOutput")); - auto output_shape = expected.shape(); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("ConvOutput")); + auto output_shape = expected->shape(); // Winograd convolution // transform filter @@ -272,9 +275,11 @@ void WinogradConvolutionWithPad(const index_t batch, ImageToBuffer(&net, "WinoOutputImage", "WinoOutput", kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("WinoOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), + 1e-2, 1e-2); } else { - ExpectTensorNear(expected, *net.GetOutput("WinoOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), + 1e-5, 1e-4); } } } // namespace diff --git a/mace/ops/winograd_inverse_transform.h b/mace/ops/winograd_inverse_transform.h index 0349de8ace51322cdc715c9bc81ee3c4ec21b2bb..548c889a2538b147eae895f24f7b844de5fc6e1c 100644 --- a/mace/ops/winograd_inverse_transform.h +++ b/mace/ops/winograd_inverse_transform.h @@ -29,9 +29,11 @@ namespace ops { template class WinogradInverseTransformOp : public Operator { public: - WinogradInverseTransformOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(kernels::StringToActivationType( + WinogradInverseTransformOp(const OperatorDef &op_def, + OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + kernels::StringToActivationType( OperatorBase::GetOptionalArg("activation", "NOOP")), OperatorBase::GetOptionalArg("max_limit", 0.0f), diff --git a/mace/ops/winograd_transform.h b/mace/ops/winograd_transform.h index db874287a4dae9b09cee727516789b52e3349399..2274b6e8a8c29aa0a4d46cda6a344206055aa0fa 100644 --- a/mace/ops/winograd_transform.h +++ b/mace/ops/winograd_transform.h @@ -26,9 +26,10 @@ namespace ops { template class WinogradTransformOp : public Operator { public: - WinogradTransformOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(static_cast(OperatorBase::GetOptionalArg( + WinogradTransformOp(const OperatorDef 
diff --git a/mace/ops/winograd_transform.h b/mace/ops/winograd_transform.h
index db874287a4dae9b09cee727516789b52e3349399..2274b6e8a8c29aa0a4d46cda6a344206055aa0fa 100644
--- a/mace/ops/winograd_transform.h
+++ b/mace/ops/winograd_transform.h
@@ -26,9 +26,10 @@ namespace ops {
 template <DeviceType D, typename T>
 class WinogradTransformOp : public Operator<D, T> {
  public:
-  WinogradTransformOp(const OperatorDef &op_def, Workspace *ws)
-      : Operator<D, T>(op_def, ws),
-        functor_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
+  WinogradTransformOp(const OperatorDef &op_def, OpKernelContext *context)
+      : Operator<D, T>(op_def, context),
+        functor_(context,
+                 static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
                      "padding", static_cast<int>(VALID))),
                  OperatorBase::GetRepeatedArgs<int>("padding_values"),
                  OperatorBase::GetOptionalArg<int>(
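Both operator changes above follow the pattern used throughout this refactor: the Workspace pointer (and the global singletons reached through it) is replaced by an OpKernelContext that the engine passes into every operator, and that each operator forwards into its kernel functor. A minimal self-contained sketch of that wiring; the types below are simplified stand-ins for illustration, not MACE's real OpKernelContext, Allocator, or functor interfaces:

    #include <cstddef>
    #include <iostream>
    #include <new>

    // Simplified stand-in: in MACE this would be the device allocator.
    struct Allocator {
      virtual void *New(std::size_t bytes) { return ::operator new(bytes); }
      virtual void Delete(void *ptr) { ::operator delete(ptr); }
      virtual ~Allocator() {}
    };

    // The context carries per-engine state (device, allocator, tuner, ...)
    // that operators previously fetched from global registries.
    struct OpKernelContext {
      explicit OpKernelContext(Allocator *allocator) : allocator(allocator) {}
      Allocator *allocator;
    };

    // Kernel functors now receive the context in their constructor ...
    struct TransformFunctor {
      explicit TransformFunctor(OpKernelContext *context) : context(context) {}
      void operator()() {
        void *scratch = context->allocator->New(64);  // no global lookup
        std::cout << "ran with injected allocator\n";
        context->allocator->Delete(scratch);
      }
      OpKernelContext *context;
    };

    // ... and operators forward the context instead of a Workspace pointer.
    struct TransformOp {
      explicit TransformOp(OpKernelContext *context) : functor(context) {}
      void Run() { functor(); }
      TransformFunctor functor;
    };

    int main() {
      Allocator cpu_allocator;           // e.g. one per engine/device
      OpKernelContext context(&cpu_allocator);
      TransformOp op(&context);          // context flows op -> functor
      op.Run();
    }

With this shape, two engines in one process can use different devices and allocators without touching shared mutable state, which is what the multi-threaded API test further down exercises.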
diff --git a/mace/public/BUILD b/mace/public/BUILD
index 3669d59518f3b89484626d1f023195f58395b924..b434312bcfdd4ec65a78bfc879a2dfcb41cc129c 100644
--- a/mace/public/BUILD
+++ b/mace/public/BUILD
@@ -11,7 +11,6 @@ cc_library(
     name = "public",
     hdrs = [
         "mace.h",
-        "mace_runtime.h",
     ],
     copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
 )
diff --git a/mace/public/mace.h b/mace/public/mace.h
index f6116348f7f5874021271fd04feb680c615df4c7..0b743423b3557ed2c5687334a96c0285e4d125d7 100644
--- a/mace/public/mace.h
+++ b/mace/public/mace.h
@@ -24,12 +24,36 @@
 #include <string>
 #include <vector>
 
+#ifndef MACE_API
+#define MACE_API __attribute__((visibility("default")))
+#endif
+
 namespace mace {
 
 class NetDef;
 
 enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 };
 
+enum GPUPerfHint {
+  PERF_DEFAULT = 0,
+  PERF_LOW = 1,
+  PERF_NORMAL = 2,
+  PERF_HIGH = 3
+};
+
+enum GPUPriorityHint {
+  PRIORITY_DEFAULT = 0,
+  PRIORITY_LOW = 1,
+  PRIORITY_NORMAL = 2,
+  PRIORITY_HIGH = 3
+};
+
+enum CPUAffinityPolicy {
+  AFFINITY_NONE = 0,
+  AFFINITY_BIG_ONLY = 1,
+  AFFINITY_LITTLE_ONLY = 2,
+};
+
 struct CallStats {
   int64_t start_micros;
   int64_t end_micros;
@@ -73,14 +97,167 @@ enum MaceStatus {
     }                   \
   }
 
+/// \brief Get ARM big.LITTLE configuration.
+///
+/// This function detects the max frequencies of all CPU cores, treats the
+/// cores with the largest max frequency as big cores, and all the remaining
+/// cores as little. If all CPU cores' max frequencies are equal, big_core_ids
+/// and little_core_ids will both be filled with all CPU core ids.
+///
+/// \param [out] big_core_ids
+/// \param [out] little_core_ids
+/// \return MACE_SUCCESS if successful, or an error if it can't reliably
+/// detect the frequency of big-LITTLE cores (e.g. MTK).
+
+MACE_API MaceStatus GetBigLittleCoreIDs(std::vector<size_t> *big_core_ids,
+                                        std::vector<size_t> *little_core_ids);
+
+/// \brief GPUContext contains the state used by the GPU device.
+///
+/// The lifetime of a GPUContext object must cover that of every MaceEngine
+/// using it. Use one GPUContext for all MaceEngines: engines running on GPU
+/// have some data in common, so sharing one GPUContext avoids duplication
+/// and speeds up the initialization procedure.
+class GPUContext;
+
+/// \brief GPUContext builder.
+///
+/// Use GPUContextBuilder to create a GPUContext.
+/// Not thread-safe.
+class MACE_API GPUContextBuilder {
+ public:
+  GPUContextBuilder();
+  ~GPUContextBuilder();
+  GPUContextBuilder(const GPUContextBuilder &) = delete;
+  GPUContextBuilder(const GPUContextBuilder &&) = delete;
+  GPUContextBuilder &operator=(const GPUContextBuilder &) = delete;
+  GPUContextBuilder &operator=(const GPUContextBuilder &&) = delete;
+
+  /// \brief Set the internal storage path used to store internal data.
+  ///
+  /// Currently the path is used to store the built OpenCL binaries to file,
+  /// which speeds up GPU initialization and the first run.
+  /// If this API is not called, GPU initialization may be slow.
+  ///
+  /// \param path Make sure your program has read/write permission for the path
+  /// \return
+  GPUContextBuilder &SetStoragePath(const std::string &path);
+  /// \brief Set paths of the generated OpenCL compiled kernel binary file (not libOpenCL.so)  // NOLINT(whitespace/line_length)
+  ///
+  /// If you run on a GPU of a specific SoC, using an OpenCL binary will speed up the initialization.  // NOLINT(whitespace/line_length)
+  /// The OpenCL binary corresponds to the OpenCL driver version, so the
+  /// binary should be regenerated when the OpenCL driver changes.
+  ///
+  /// \param paths MACE will use the first file found among all paths
+  /// \return
+  GPUContextBuilder &SetOpenCLBinaryPaths(
+      const std::vector<std::string> &paths);
+  /// \brief Set the path of the generated OpenCL parameter file
+  ///
+  /// If you run on a GPU of a specific SoC, the parameters are the local
+  /// work group sizes tuned for that SoC, which may be faster than the
+  /// general parameters.
+  ///
+  /// \param path Make sure your program has read/write permission for the path
+  /// \return
+  GPUContextBuilder &SetOpenCLParameterPath(const std::string &path);
+
+  std::shared_ptr<GPUContext> Finalize();
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+class MACE_API MaceEngineConfig {
+ public:
+  explicit MaceEngineConfig(const DeviceType device_type);
+  ~MaceEngineConfig();
+  MaceEngineConfig(const MaceEngineConfig &) = delete;
+  MaceEngineConfig(const MaceEngineConfig &&) = delete;
+  MaceEngineConfig &operator=(const MaceEngineConfig &) = delete;
+  MaceEngineConfig &operator=(const MaceEngineConfig &&) = delete;
+
+  /// \brief Set GPUContext
+  ///
+  /// Use one GPUContext for multiple models running on GPU.
+  /// \param context a context created with GPUContextBuilder
+  /// \return MACE_SUCCESS for success, other for failure.
+  MaceStatus SetGPUContext(std::shared_ptr<GPUContext> context);
+
+  /// \brief Set GPU hints, currently only supports Adreno GPU.
+  ///
+  /// Caution: this function may hurt performance
+  /// if improper parameters are provided.
+  ///
+  /// \param perf_hint performance hint
+  /// \param priority_hint priority hint
+  /// \return MACE_SUCCESS for success, other for failure.
+  MaceStatus SetGPUHints(GPUPerfHint perf_hint,
+                         GPUPriorityHint priority_hint);
+
+  /// \brief Set CPU threads number and affinity policy.
+  ///
+  /// Caution: this function may hurt performance if improper
+  /// parameters are provided. When num_threads_hint is zero or negative,
+  /// the function will set the threads number equal to the number of
+  /// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
+  /// (AFFINITY_NONE) cores according to the policy. The threads number will
+  /// also be truncated to the corresponding cores number when num_threads_hint
+  /// is larger than it.
+  /// The OpenMP threads will be bound (via sched_setaffinity) to big cores
+  /// (AFFINITY_BIG_ONLY) or little cores (AFFINITY_LITTLE_ONLY).
+  ///
+  /// \param num_threads_hint it is only a hint.
+  /// \param policy one of CPUAffinityPolicy
+  /// \param use_gemmlowp use gemmlowp for quantized inference
+  /// \return MACE_SUCCESS for success, or an error if it can't reliably
+  /// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's
+  /// suggested to use AFFINITY_NONE to use all cores.
+  MaceStatus SetCPUThreadPolicy(int num_threads_hint,
+                                CPUAffinityPolicy policy,
+                                bool use_gemmlowp = false);
+
+  /// \brief Set OpenMP threads number and processor affinity.
+  ///
+  /// Caution: this function may hurt performance
+  /// if improper parameters are provided.
+  /// This function may not work well on some chips (e.g. MTK). Setting thread
+  /// affinity to offline cores may run very slowly or behave unexpectedly.
+  /// In such cases, please use SetCPUThreadPolicy with the default policy
+  /// instead.
+  ///
+  /// \param num_threads
+  /// \param cpu_ids
+  /// \return MACE_SUCCESS for success, other for failure.
+  MaceStatus SetOpenMPThreadAffinity(
+      int num_threads,
+      const std::vector<size_t> &cpu_ids);
+
+  DeviceType device_type() const;
+
+  int num_threads() const;
+
+  std::shared_ptr<GPUContext> gpu_context() const;
+
+  GPUPriorityHint gpu_priority_hint() const;
+
+  GPUPerfHint gpu_perf_hint() const;
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
 // MACE input/output tensor
-class __attribute__((visibility("default"))) MaceTensor {
+class MACE_API MaceTensor {
  public:
   // shape - the shape of the tensor, with size n
   // data - the buffer of the tensor, must not be null with size equals
   //        shape[0] * shape[1] * ... * shape[n-1]
-  explicit MaceTensor(const std::vector<int64_t> &shape,
-                      std::shared_ptr<float> data);
+  MaceTensor(const std::vector<int64_t> &shape,
+             std::shared_ptr<float> data);
   MaceTensor();
   MaceTensor(const MaceTensor &other);
   MaceTensor(const MaceTensor &&other);
@@ -97,9 +274,9 @@ class __attribute__((visibility("default"))) MaceTensor {
   std::unique_ptr<Impl> impl_;
 };
 
-class __attribute__((visibility("default"))) MaceEngine {
+class MACE_API MaceEngine {
  public:
-  explicit MaceEngine(DeviceType device_type);
+  explicit MaceEngine(const MaceEngineConfig &config);
   ~MaceEngine();
 
   MaceStatus Init(const NetDef *net_def,
@@ -135,18 +312,16 @@
 /// \param model_data_file[in]: the path of model data file
 /// \param input_nodes[in]: the array of input nodes' name
 /// \param output_nodes[in]: the array of output nodes' name
-/// \param device_type[in]: one of [CPU, GPU, HEXAGON],
-///        based on the runtime type of your model deployment file.
+/// \param config[in]: configurations for MaceEngine.
 /// \param engine[out]: output MaceEngine object
 /// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments,
 ///         MACE_OUT_OF_RESOURCES for resources is out of range.
-__attribute__((visibility("default")))
-MaceStatus CreateMaceEngineFromProto(
+MACE_API MaceStatus CreateMaceEngineFromProto(
     const std::vector<unsigned char> &model_pb,
     const std::string &model_data_file,
    const std::vector<std::string> &input_nodes,
     const std::vector<std::string> &output_nodes,
-    const DeviceType device_type,
+    const MaceEngineConfig &config,
     std::shared_ptr<MaceEngine> *engine);
 
 }  // namespace mace
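Taken together, the additions above replace the free functions of mace/public/mace_runtime.h (deleted next) with two objects: a GPUContext that can be shared by every engine in the process, and a per-engine MaceEngineConfig. The following is a sketch of client code written against this header; the storage path, OpenCL binary/parameter paths, model data file, node names, and thread count are all placeholders, and error checking is trimmed:

    #include <memory>
    #include <string>
    #include <vector>

    #include "mace/public/mace.h"

    mace::MaceStatus BuildEngine(const std::vector<unsigned char> &model_pb,
                                 std::shared_ptr<mace::MaceEngine> *engine) {
      // One GPUContext can be shared by every engine in the process.
      std::shared_ptr<mace::GPUContext> gpu_context =
          mace::GPUContextBuilder()
              .SetStoragePath("/path/with/rw/permission")       // placeholder
              .SetOpenCLBinaryPaths({"/path/to/opencl.bin"})    // placeholder
              .SetOpenCLParameterPath("/path/to/opencl.param")  // placeholder
              .Finalize();

      // Per-engine configuration replaces the old global setters.
      mace::MaceEngineConfig config(mace::DeviceType::GPU);
      config.SetGPUContext(gpu_context);
      config.SetGPUHints(mace::PERF_HIGH, mace::PRIORITY_NORMAL);
      config.SetCPUThreadPolicy(4, mace::AFFINITY_BIG_ONLY);

      return mace::CreateMaceEngineFromProto(
          model_pb, "/path/to/model.data",                      // placeholder
          {"input"}, {"output"}, config, engine);
    }

mace_run.cc and quantize_stat.cc further down perform exactly this sequence with gflags-supplied values.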
diff --git a/mace/public/mace_runtime.h b/mace/public/mace_runtime.h
deleted file mode 100644
index 4cd60d2b60633c7df5c30de45ecd26df64581cc3..0000000000000000000000000000000000000000
--- a/mace/public/mace_runtime.h
+++ /dev/null
@@ -1,186 +0,0 @@
-// Copyright 2018 Xiaomi, Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// This file defines runtime tuning APIs.
-// These APIs are not stable.
-
-#ifndef MACE_PUBLIC_MACE_RUNTIME_H_
-#define MACE_PUBLIC_MACE_RUNTIME_H_
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "mace/public/mace.h"
-
-namespace mace {
-
-enum GPUPerfHint {
-  PERF_DEFAULT = 0,
-  PERF_LOW = 1,
-  PERF_NORMAL = 2,
-  PERF_HIGH = 3
-};
-
-enum GPUPriorityHint {
-  PRIORITY_DEFAULT = 0,
-  PRIORITY_LOW = 1,
-  PRIORITY_NORMAL = 2,
-  PRIORITY_HIGH = 3
-};
-
-enum CPUAffinityPolicy {
-  AFFINITY_NONE = 0,
-  AFFINITY_BIG_ONLY = 1,
-  AFFINITY_LITTLE_ONLY = 2,
-};
-
-class KVStorage {
- public:
-  // return: 0 for success, -1 for error
-  virtual int Load() = 0;
-  virtual void Clear() = 0;
-  virtual bool Insert(const std::string &key,
-                      const std::vector<unsigned char> &value) = 0;
-  virtual const std::vector<unsigned char> *Find(const std::string &key) = 0;
-  // return: 0 for success, -1 for error
-  virtual int Flush() = 0;
-  virtual ~KVStorage() {}
-};
-
-class KVStorageFactory {
- public:
-  virtual std::unique_ptr<KVStorage> CreateStorage(const std::string &name) = 0;
-};
-
-class __attribute__((visibility("default"))) FileStorageFactory
-    : public KVStorageFactory {
- public:
-  // You have to make sure your APP have read and write permission of the path.
-  explicit FileStorageFactory(const std::string &path);
-
-  ~FileStorageFactory();
-
-  std::unique_ptr<KVStorage> CreateStorage(const std::string &name) override;
-
- private:
-  class Impl;
-  std::unique_ptr<Impl> impl_;
-};
-
-/// \brief Set internal storage factory to store internal data. (Call once)
-///
-/// Now the path is used to store the built OpenCL binaries to file,
-/// which could speed up the GPU initialization and first run.
-/// If do not call this API, the initialization maybe slow for GPU.
-///
-/// \param path Make sure your program have Read/Write permission of the path
-/// \return
-__attribute__((visibility("default")))
-void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory);
-
-/// \brief Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so)  // NOLINT(whitespace/line_length)
-///
-/// Just call once. (Not thread-safe)
-/// if you use gpu of specific soc, Using OpenCL binary will speed up the initialization.  // NOLINT(whitespace/line_length)
-/// OpenCL binary is corresponding to the OpenCL Driver version,
-/// you should update the binary when OpenCL Driver changed.
-///
-/// \param paths MACE will use first file found in all paths
-/// \return
-__attribute__((visibility("default")))
-void SetOpenCLBinaryPaths(const std::vector<std::string> &paths);
-
-/// \brief Set the path of Generated OpenCL parameter file
-///
-/// Just call once. (Not thread-safe)
-/// If you use gpu for specific soc, The parameters is the local work group
-/// size tuned for specific SOC, which may be faster than the
-/// general parameters.
-///
-/// \param path Make sure your program have Read/Write permission of the path
-/// \return
-__attribute__((visibility("default")))
-void SetOpenCLParameterPath(const std::string &path);
-
-/// \brief Set GPU hints, currently only supports Adreno GPU.
-///
-/// Caution: this function may hurt performance
-/// if improper parameters provided.
-///
-/// \param perf_hint performance hint
-/// \param priority_hint priority hint
-/// \return
-__attribute__((visibility("default")))
-void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);
-
-/// \brief Set OpenMP threads number and affinity policy.
-///
-/// Caution: this function may hurt performance if improper parameters provided.
-/// When num_threads_hint is zero or negative,
-/// the function will set the threads number equaling to the number of
-/// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
-/// (AFFINITY_NONE) cores according to the policy. The threads number will
-/// also be truncated to the corresponding cores number when num_threads_hint
-/// is larger than it.
-/// The OpenMP threads will be bind to (via sched_setaffinity) big cores
-/// (AFFINITY_BIG_ONLY) or little cores (AFFINITY_LITTLE_ONLY).
-/// If use_gemmlowp is set to be true, then gemmlowp threads would be set for
-/// quantized inference.
-///
-/// \param num_threads_hint it is only a hint.
-/// \param policy one of CPUAffinityPolicy
-/// \param use_gemmlowp use gemmlowp for quantized inference
-/// \return MACE_SUCCESS for success, or it can't reliably detect big-LITTLE
-/// cores (see GetBigLittleCoreIDs). In such cases, it's suggested to use
-/// AFFINITY_NONE to use all cores.
-__attribute__((visibility("default")))
-MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
-                                 CPUAffinityPolicy policy,
-                                 bool use_gemmlowp = false);
-
-/// \brief Set OpenMP threads number and processor affinity.
-///
-/// Caution: this function may hurt performance
-/// if improper parameters provided.
-/// This function may not work well on some chips (e.g. MTK). Setting thread
-/// affinity to offline cores may run very slow or unexpectedly.
-/// In such cases, please use SetOpenMPThreadPolicy with default policy
-/// instead.
-///
-/// \param num_threads
-/// \param cpu_ids
-/// \return
-__attribute__((visibility("default")))
-MaceStatus SetOpenMPThreadAffinity(int num_threads,
-                                   const std::vector<size_t> &cpu_ids);
-
-/// \brief Get ARM big.LITTLE configuration.
-///
-/// This function will detect the max frequencies of all CPU cores, and assume
-/// the cores with largest max frequencies as big cores, and all the remaining
-/// cores as little. If all cpu core's max frequencies equals, big_core_ids and
-/// little_core_ids will both be filled with all cpu core ids.
-///
-/// \param [out] big_core_ids
-/// \param [out] little_core_ids
-/// \return If successful, it returns MACE_SUCCESS and error if it can't
-/// reliabley detect the frequency of big-LITTLE cores (e.g. MTK).
-__attribute__((visibility("default")))
-MaceStatus GetBigLittleCoreIDs(std::vector<size_t> *big_core_ids,
-                               std::vector<size_t> *little_core_ids);
-}  // namespace mace
-
-#endif  // MACE_PUBLIC_MACE_RUNTIME_H_
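For code still calling the deleted APIs, the migration is mechanical: each global setter has a scoped counterpart introduced in mace.h above. A quick reference in comment form (not exhaustive; names are exactly those declared in the two headers):

    // Old global API (mace_runtime.h)           New scoped API (mace.h)
    // SetKVStorageFactory(file_factory)      -> GPUContextBuilder().SetStoragePath(path)
    // SetOpenCLBinaryPaths(paths)            -> GPUContextBuilder().SetOpenCLBinaryPaths(paths)
    // SetOpenCLParameterPath(path)           -> GPUContextBuilder().SetOpenCLParameterPath(path)
    // SetGPUHints(perf, priority)            -> MaceEngineConfig::SetGPUHints(perf, priority)
    // SetOpenMPThreadPolicy(n, policy, gemm) -> MaceEngineConfig::SetCPUThreadPolicy(n, policy, gemm)
    // SetOpenMPThreadAffinity(n, cpu_ids)    -> MaceEngineConfig::SetOpenMPThreadAffinity(n, cpu_ids)
    // GetBigLittleCoreIDs(...)               -> unchanged, now declared in mace.h
    // MaceEngine(device_type)                -> MaceEngine(config)

The rest of this patch applies this mapping to the generated engine factory, the tests, and the command-line tools.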
diff --git a/mace/python/tools/mace_engine_factory.h.jinja2 b/mace/python/tools/mace_engine_factory.h.jinja2
index 472879365035bfe8a9ac945766dd559e94d72bf4..2bdda1439f039be6cfd88337a269f5cc83d23fa3 100644
--- a/mace/python/tools/mace_engine_factory.h.jinja2
+++ b/mace/python/tools/mace_engine_factory.h.jinja2
@@ -20,7 +20,6 @@
 #include <vector>
 
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 
 namespace mace {
 
@@ -57,8 +56,7 @@ std::map<std::string, int> model_name_map {
 /// if model_data_format is code, just pass empty string("")
 /// \param input_nodes[in]: the array of input nodes' name
 /// \param output_nodes[in]: the array of output nodes' name
-/// \param device_type[in]: one of [CPU, GPU, HEXAGON],
-///        based on the runtime type of your model deployment file.
+/// \param config[in]: configurations for MaceEngine.
 /// \param engine[out]: output MaceEngine object
 /// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments,
 ///         MACE_OUT_OF_RESOURCES for resources is out of range.
@@ -67,7 +65,7 @@ MaceStatus CreateMaceEngineFromCode(
     const std::string &model_data_file,
     const std::vector<std::string> &input_nodes,
     const std::vector<std::string> &output_nodes,
-    const DeviceType device_type,
+    const MaceEngineConfig &config,
     std::shared_ptr<MaceEngine> *engine) {
   // load model
   if (engine == nullptr) {
@@ -83,7 +81,7 @@ MaceStatus CreateMaceEngineFromCode(
 {% for i in range(model_tags |length) %}
     case {{ i }}:
       net_def = mace::{{model_tags[i]}}::CreateNet();
-      engine->reset(new mace::MaceEngine(device_type));
+      engine->reset(new mace::MaceEngine(config));
 {% if embed_model_data %}
       model_data = mace::{{model_tags[i]}}::LoadModelData();
       status = (*engine)->Init(net_def.get(), input_nodes, output_nodes,
diff --git a/mace/test/BUILD b/mace/test/BUILD
index 09c9e030f0376ed3d6530e3b8fb155384e3c648e..04253cda9a117cd6b7905837e8e4a09ffdd1ca21 100644
--- a/mace/test/BUILD
+++ b/mace/test/BUILD
@@ -1,6 +1,3 @@
-# Description:
-#   Mace operators.
-#
 package(
     default_visibility = ["//visibility:public"],
 )
diff --git a/mace/test/mace_api_exception_test.cc b/mace/test/mace_api_exception_test.cc
index 1eaad03726165987ce00c6df70d0b23f438a2231..7507ffc8319823554cce4d1273e023c7c87988cb 100644
--- a/mace/test/mace_api_exception_test.cc
+++ b/mace/test/mace_api_exception_test.cc
@@ -23,7 +23,9 @@ TEST(MaceAPIExceptionTest, WrongInputTest) {
   input_names.push_back(MakeString("input", 0));
   output_names.push_back(MakeString("output", 0));
 
-  const DeviceType device = DeviceType::GPU;
+  MaceEngineConfig config(DeviceType::GPU);
+  config.SetGPUContext(
+      ops::test::OpTestContext::Get()->gpu_context());
 
   std::shared_ptr<NetDef> net_def(new NetDef());
   for (size_t i = 0; i < input_names.size(); ++i) {
@@ -31,7 +33,7 @@ TEST(MaceAPIExceptionTest, WrongInputTest) {
     info->set_name(input_names[i]);
   }
 
-  MaceEngine engine(device);
+  MaceEngine engine(config);
   ASSERT_DEATH(engine.Init(net_def.get(), {"input"}, output_names, nullptr),
                "");
 }
diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc
index e2a09fec8d3991fd8dad65b8427ae61ea35b8c3a..6d554bbe3dbfbd88f338e2602c77ec6f86a2317d 100644
--- a/mace/test/mace_api_mt_test.cc
+++ b/mace/test/mace_api_mt_test.cc
@@ -18,7 +18,6 @@
 #include "mace/core/operator.h"
 #include "mace/kernels/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
-#include "mace/public/mace_runtime.h"
 
 namespace mace {
 namespace test {
@@ -200,7 +199,7 @@ void CheckOutputs(const NetDef &net_def,
 
   for (auto output : outputs) {
     std::unique_ptr<Tensor> tmp_tensor(
-        new Tensor(GetDeviceAllocator(DeviceType::CPU),
+        new Tensor(GetCPUAllocator(),
                    DataTypeToEnum<T>::v()));
     auto output_shape = output.second.shape();
     const int64_t data_size = std::accumulate(output_shape.begin(),
@@ -333,13 +332,9 @@ void MaceRunFunc(const int in_out_size) {
     OutputInfo *info = net_def->add_output_info();
     info->set_name(output_names[i]);
   }
+  MaceEngineConfig config(DeviceType::GPU);
 
-  const std::string file_path ="/data/local/tmp/mace";
-  std::shared_ptr<KVStorageFactory> storage_factory(
-      new FileStorageFactory(file_path));
-  mace::SetKVStorageFactory(storage_factory);
-
-  MaceEngine engine(device);
+  MaceEngine engine(config);
   MaceStatus status = engine.Init(net_def.get(), input_names, output_names,
       reinterpret_cast<unsigned char *>(data.data()));
   EXPECT_EQ(status, MaceStatus::MACE_SUCCESS);
@@ -367,7 +362,7 @@ TEST_F(MaceMTAPITest, MultipleThread) {
   const int thread_num = 10;
   std::vector<std::thread> threads;
   for (int i = 0; i < thread_num; ++i) {
-    threads.push_back(std::thread(MaceRunFunc, i));
+    threads.push_back(std::thread(MaceRunFunc, 1));
  }
   for (auto &t : threads) {
     t.join();
   }
diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc
index 6b1f353eb8f7a3d77e59b84f23fcf3141bfef148..83d3b33dfb1894a486197af41c7344608bff6e9a 100644
--- a/mace/test/mace_api_test.cc
+++ b/mace/test/mace_api_test.cc
@@ -18,7 +18,7 @@
 #include "mace/core/operator.h"
 #include "mace/kernels/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
-#include "mace/public/mace_runtime.h"
+#include "mace/public/mace.h"
 
 namespace mace {
 namespace test {
@@ -199,9 +199,10 @@ void CheckOutputs(const NetDef &net_def,
   }
   net.RunNet(net_def, D);
 
+  std::unique_ptr<Allocator> allocator(new CPUAllocator);
   for (auto output : outputs) {
     std::unique_ptr<Tensor> tmp_tensor(
-        new Tensor(GetDeviceAllocator(DeviceType::CPU),
+        new Tensor(allocator.get(),
                    DataTypeToEnum<T>::v()));
     auto output_shape = output.second.shape();
     const int64_t data_size = std::accumulate(output_shape.begin(),
@@ -333,7 +334,9 @@ void MaceRun(const int in_out_size,
     info->set_name(output_names[i]);
   }
 
-  MaceEngine engine(device);
+  MaceEngineConfig config(DeviceType::GPU);
+
+  MaceEngine engine(config);
   MaceStatus status = engine.Init(net_def.get(), input_names, output_names,
       reinterpret_cast<unsigned char *>(data.data()));
   EXPECT_EQ(status, MaceStatus::MACE_SUCCESS);
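The CheckOutputs hunk above shows the test-side view of the allocator refactor: the test constructs a CPUAllocator itself and hands it to each Tensor instead of fetching a process-wide allocator with GetDeviceAllocator(DeviceType::CPU). A self-contained sketch of why that ownership pattern matters; Allocator and Tensor below are simplified stand-ins, not MACE's real classes:

    #include <cstdlib>
    #include <iostream>
    #include <memory>

    // Simplified stand-in for mace::Allocator.
    class Allocator {
     public:
      virtual ~Allocator() {}
      virtual void *New(std::size_t nbytes) { return std::malloc(nbytes); }
      virtual void Delete(void *p) { std::free(p); }
    };

    // Simplified stand-in for mace::Tensor: the tensor no longer asks a
    // global registry for its allocator; whoever creates it decides where
    // its memory lives.
    class Tensor {
     public:
      Tensor(Allocator *allocator, std::size_t nbytes)
          : allocator_(allocator), data_(allocator->New(nbytes)) {}
      ~Tensor() { allocator_->Delete(data_); }
     private:
      Allocator *allocator_;
      void *data_;
    };

    int main() {
      // Each test (or engine) owns a private allocator, so no registration
      // order matters and nothing leaks across tests.
      std::unique_ptr<Allocator> allocator(new Allocator);
      Tensor tmp(allocator.get(), 1024);
      std::cout << "tensor backed by injected allocator\n";
    }

The only design constraint, visible in the diff above, is that the allocator must outlive every tensor it backs, which scoping it to the test function guarantees.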
diff --git a/mace/tools/quantization/quantize_stat.cc b/mace/tools/quantization/quantize_stat.cc
index a05f42f70621ff558d790e2f0249534f0a0271f2..936196e3065705da3788d2616e4afa79335b56d1 100644
--- a/mace/tools/quantization/quantize_stat.cc
+++ b/mace/tools/quantization/quantize_stat.cc
@@ -33,7 +33,6 @@
 
 #include "gflags/gflags.h"
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/env_time.h"
 #include "mace/utils/logging.h"
 #include "mace/utils/utils.h"
@@ -122,8 +121,15 @@ bool RunModel(const std::string &model_name,
               const std::vector<std::vector<int64_t>> &input_shapes,
               const std::vector<std::string> &output_names,
               const std::vector<std::vector<int64_t>> &output_shapes) {
-  MACE_RETURN_IF_ERROR(mace::SetOpenMPThreadPolicy(
-      FLAGS_omp_num_threads, CPUAffinityPolicy::AFFINITY_NONE));
+  // config runtime
+  MaceStatus status;
+  MaceEngineConfig config(DeviceType::CPU);
+  status = config.SetCPUThreadPolicy(
+      FLAGS_omp_num_threads,
+      CPUAffinityPolicy::AFFINITY_NONE);
+  if (status != MACE_SUCCESS) {
+    LOG(WARNING) << "Set openmp or cpu affinity failed.";
+  }
 
   std::vector<unsigned char> model_pb_data;
   if (FLAGS_model_file != "") {
@@ -141,7 +147,7 @@ bool RunModel(const std::string &model_name,
                                 FLAGS_model_data_file,
                                 input_names,
                                 output_names,
-                                DeviceType::CPU,
+                                config,
                                 &engine));
 #else
   (void) (model_name);
@@ -150,7 +156,7 @@ bool RunModel(const std::string &model_name,
                                 FLAGS_model_data_file,
                                 input_names,
                                 output_names,
-                                DeviceType::CPU,
+                                config,
                                 &engine));
 #endif
diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc
index 0aeefb789c9b6720c5b7cd3814497cf2dbf22d23..3873e5dd3fa80efc6aa90e1d2fdd483a26d1f423 100644
--- a/mace/tools/validation/mace_run.cc
+++ b/mace/tools/validation/mace_run.cc
@@ -33,7 +33,6 @@
 
 #include "gflags/gflags.h"
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/env_time.h"
 #include "mace/utils/logging.h"
 #include "mace/utils/utils.h"
@@ -203,35 +202,37 @@ bool RunModel(const std::string &model_name,
               const std::vector<std::vector<int64_t>> &output_shapes) {
   DeviceType device_type = ParseDeviceType(FLAGS_device);
 
   // config runtime
-  MaceStatus status = mace::SetOpenMPThreadPolicy(
-      FLAGS_omp_num_threads,
-      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
-      true);
+  MaceStatus status;
+  MaceEngineConfig config(device_type);
+  status = config.SetCPUThreadPolicy(
+      FLAGS_omp_num_threads,
+      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
+      true);
   if (status != MACE_SUCCESS) {
     LOG(WARNING) << "Set openmp or cpu affinity failed.";
   }
 #ifdef MACE_ENABLE_OPENCL
+  std::shared_ptr<GPUContext> gpu_context;
   if (device_type == DeviceType::GPU) {
-    mace::SetGPUHints(
-        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
-        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
-
+    const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
+    const std::string storage_path =
+        std::string(storage_path_ptr == nullptr ?
+                    "/data/local/tmp/mace_run/interior" : storage_path_ptr);
     std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
-    mace::SetOpenCLBinaryPaths(opencl_binary_paths);
-    mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file);
+    gpu_context = GPUContextBuilder()
+        .SetStoragePath(storage_path)
+        .SetOpenCLBinaryPaths(opencl_binary_paths)
+        .SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
+        .Finalize();
+
+    config.SetGPUContext(gpu_context);
+    config.SetGPUHints(
+        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
+        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
   }
 #endif  // MACE_ENABLE_OPENCL
 
-  const char *kernel_path = getenv("MACE_INTERNAL_STORAGE_PATH");
-  const std::string kernel_file_path =
-      std::string(kernel_path == nullptr ?
-                  "/data/local/tmp/mace_run/interior" : kernel_path);
-
-  std::shared_ptr<KVStorageFactory> storage_factory(
-      new FileStorageFactory(kernel_file_path));
-  SetKVStorageFactory(storage_factory);
-
   std::vector<unsigned char> model_pb_data;
   if (FLAGS_model_file != "") {
     if (!mace::ReadBinaryFile(&model_pb_data, FLAGS_model_file)) {
@@ -252,7 +253,7 @@ bool RunModel(const std::string &model_name,
                                  FLAGS_model_data_file,
                                  input_names,
                                  output_names,
-                                 device_type,
+                                 config,
                                  &engine);
 #else
   (void)(model_name);
@@ -261,7 +262,7 @@ bool RunModel(const std::string &model_name,
                                  FLAGS_model_data_file,
                                  input_names,
                                  output_names,
-                                 device_type,
+                                 config,
                                  &engine);
 #endif
   int64_t t1 = NowMicros();
@@ -326,7 +327,7 @@ bool RunModel(const std::string &model_name,
                                    FLAGS_model_data_file,
                                    input_names,
                                    output_names,
-                                   device_type,
+                                   config,
                                    &engine);
 #else
       create_engine_status =
@@ -334,7 +335,7 @@ bool RunModel(const std::string &model_name,
                                    FLAGS_model_data_file,
                                    input_names,
                                    output_names,
-                                   device_type,
+                                   config,
                                    &engine);
 #endif
     } while (create_engine_status != MACE_SUCCESS);
@@ -366,7 +367,7 @@ bool RunModel(const std::string &model_name,
                                  FLAGS_model_data_file,
                                  input_names,
                                  output_names,
-                                 device_type,
+                                 config,
                                  &engine);
 #else
       create_engine_status =
@@ -374,7 +375,7 @@ bool RunModel(const std::string &model_name,
                                  FLAGS_model_data_file,
                                  input_names,
                                  output_names,
-                                 device_type,
+                                 config,
                                  &engine);
 #endif
     } while (create_engine_status != MACE_SUCCESS);
diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h
index e4007b6694a415e7a058b5f6f33a93a2ba485e8e..3295ddaec12e3703c9839e19e55414fca873dcaf 100644
--- a/mace/utils/tuner.h
+++ b/mace/utils/tuner.h
@@ -15,6 +15,8 @@
 #ifndef MACE_UTILS_TUNER_H_
 #define MACE_UTILS_TUNER_H_
 
 #include <stdlib.h>
+
+#include <cstring>
 #include <fstream>
 #include <functional>
 #include <limits>
@@ -29,18 +31,24 @@
 
 namespace mace {
 
+inline bool IsTuning() {
+  const char *tuning = getenv("MACE_TUNING");
+  return tuning != nullptr && strlen(tuning) == 1 && tuning[0] == '1';
+}
+
 template <typename param_type>
 class Tuner {
  public:
-  static Tuner *Get() {
-    static Tuner tuner;
-    return &tuner;
+  explicit Tuner(const std::string tuned_param_file_path = ""):
+      tuned_param_file_path_(tuned_param_file_path) {
+    path_ = getenv("MACE_RUN_PARAMETER_PATH");
+    ReadRunParamters();
   }
 
-  inline bool IsTuning() {
-    const char *tuning = getenv("MACE_TUNING");
-    return tuning != nullptr && strlen(tuning) == 1 && tuning[0] == '1';
-  }
+  ~Tuner() { WriteRunParameters(); }
+
+  Tuner(const Tuner &) = delete;
+  Tuner &operator=(const Tuner &) = delete;
 
   template <typename RetType>
   RetType TuneOrRun(
@@ -76,16 +84,6 @@ class Tuner {
   }
 
  private:
-  Tuner() {
-    path_ = getenv("MACE_RUN_PARAMETER_PATH");
-    ReadRunParamters();
-  }
-
-  ~Tuner() { WriteRunParameters(); }
-
-  Tuner(const Tuner &) = delete;
-  Tuner &operator=(const Tuner &) = delete;
-
   inline void WriteRunParameters() {
     if (path_ != nullptr) {
       VLOG(3) << "Write tuning result to " << path_;
@@ -117,9 +115,9 @@ class Tuner {
   }
 
   inline void ReadRunParamters() {
-    extern std::string kOpenCLParameterPath;
-    if (!kOpenCLParameterPath.empty()) {
-      std::ifstream ifs(kOpenCLParameterPath, std::ios::binary | std::ios::in);
+    if (!tuned_param_file_path_.empty()) {
+      std::ifstream ifs(tuned_param_file_path_,
+                        std::ios::binary | std::ios::in);
       if (ifs.is_open()) {
         int64_t num_params = 0;
         ifs.read(reinterpret_cast<char *>(&num_params), sizeof(num_params));
@@ -144,7 +142,7 @@ class Tuner {
         LOG(WARNING) << "Read OpenCL tuned parameters file failed.";
       }
     } else {
-      LOG(INFO) << "There is no tuned parameters.";
+      VLOG(1) << "There are no tuned parameters.";
     }
   }
 
@@ -207,6 +205,7 @@ class Tuner {
   }
 
  private:
+  std::string tuned_param_file_path_;
   const char *path_;
   std::unordered_map<std::string, std::vector<param_type>> param_table_;
 };
diff --git a/mace/utils/tuner_test.cc b/mace/utils/tuner_test.cc
index bd590ac90f764849f6cc91c23a829b575a5c9b68..bff02b0bd4179f25b7bb732cfd61cee6159eba79 100644
--- a/mace/utils/tuner_test.cc
+++ b/mace/utils/tuner_test.cc
@@ -42,15 +42,16 @@ TEST_F(TunerTest, SimpleRun) {
     }
   };
 
+  Tuner<unsigned int> tuner;
   WallClockTimer timer;
   std::vector<unsigned int> default_params(1, 1);
-  int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
+  int res = tuner.TuneOrRun<unsigned int>(
       "SimpleRun", default_params, nullptr, TunerFunc, &timer);
 
   EXPECT_EQ(expect, res);
 
   default_params[0] = 2;
-  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
+  res = tuner.TuneOrRun<unsigned int>(
       "SimpleRun", default_params, nullptr, TunerFunc, &timer);
   EXPECT_EQ(expect + 1, res);
 }
@@ -88,13 +89,14 @@ TEST_F(TunerTest, SimpleTune) {
     return {{1}, {2}, {3}, {4}};
   };
   // tune
+  Tuner<unsigned int> tuner;
   WallClockTimer timer;
-  int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
+  int res = tuner.TuneOrRun<unsigned int>(
       "SimpleRun", default_params, *params_generator, TunerFunc, &timer);
   EXPECT_EQ(expect, res);
   // run
-  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
+  res = tuner.template TuneOrRun<unsigned int>(
       "SimpleRun", default_params, nullptr, TunerFunc, &timer);
   EXPECT_EQ(expect, res);
 }
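The Tuner change completes the de-globalization: the singleton Get() accessor is gone, the tuned-parameter file path is injected through the constructor (replacing the extern kOpenCLParameterPath global), destruction flushes results via WriteRunParameters(), and tuning mode is now the free IsTuning() check on the MACE_TUNING environment variable. A usage sketch modeled on tuner_test.cc above; the functor signature (params, timer, tuning_result) -> result and the timer header path are assumptions inferred from the test, and the file path is a placeholder:

    #include <vector>

    #include "mace/utils/timer.h"
    #include "mace/utils/tuner.h"

    void TunerUsageSketch() {
      // Each engine owns a private Tuner; the tuned-parameter file is
      // injected rather than read from a global.
      mace::Tuner<unsigned int> tuner("/path/to/tuned_params.bin");  // placeholder
      mace::WallClockTimer timer;
      std::vector<unsigned int> default_params(1, 1);

      auto func = [](const std::vector<unsigned int> &params,
                     mace::Timer *t,
                     std::vector<unsigned int> *tuning_result) -> unsigned int {
        (void)t;
        *tuning_result = params;  // pretend these params are optimal
        return params.front();    // pretend this is the measured result
      };

      // With a nullptr param_generator the call just runs with the tuned
      // (or default) parameters; export MACE_TUNING=1 and pass a generator
      // to actually search candidates.
      unsigned int res = tuner.TuneOrRun<unsigned int>(
          "SimpleRun", default_params, nullptr, func, &timer);
      (void)res;
    }

Because the destructor writes results to MACE_RUN_PARAMETER_PATH, scoping the Tuner to the engine also gives each engine its own parameter file lifecycle instead of a process-wide flush.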