提交 a992621c 编写于 作者: L liuqi 提交者: 赵奇可

Refactor configuration APIs and Remove some global static variables.

上级 a7ff559c
...@@ -22,7 +22,6 @@ ...@@ -22,7 +22,6 @@
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
#include "mace/benchmark/statistics.h" #include "mace/benchmark/statistics.h"
...@@ -257,36 +256,40 @@ int Main(int argc, char **argv) { ...@@ -257,36 +256,40 @@ int Main(int argc, char **argv) {
mace::DeviceType device_type = ParseDeviceType(FLAGS_device); mace::DeviceType device_type = ParseDeviceType(FLAGS_device);
// config runtime // configuration
MaceStatus ret = mace::SetOpenMPThreadPolicy( MaceStatus mace_status;
MaceEngineConfig config(device_type);
mace_status = config.SetCPUThreadPolicy(
FLAGS_omp_num_threads, FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy), static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy),
true); true);
if (ret != MACE_SUCCESS) { if (mace_status != MACE_SUCCESS) {
LOG(WARNING) << "Set openmp or cpu affinity failed."; LOG(INFO) << "Set openmp or cpu affinity failed.";
} }
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
std::shared_ptr<GPUContext> gpu_context;
if (device_type == DeviceType::GPU) { if (device_type == DeviceType::GPU) {
mace::SetGPUHints( // DO NOT USE tmp directory.
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint), // Please use APP's own directory and make sure the directory exists.
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint)); const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
const std::string storage_path =
std::string(storage_path_ptr == nullptr ?
"/data/local/tmp/mace_run/interior" : storage_path_ptr);
std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file}; std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
mace::SetOpenCLBinaryPaths(opencl_binary_paths);
mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file); gpu_context = GPUContextBuilder()
.SetStoragePath(storage_path)
.SetOpenCLBinaryPaths(opencl_binary_paths)
.SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
.Finalize();
config.SetGPUContext(gpu_context);
config.SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
} }
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
const char *kernel_path = getenv("MACE_INTERNAL_STORAGE_PATH");
const std::string kernel_file_path =
std::string(kernel_path == nullptr ?
"/data/local/tmp/mace_run/interior" : kernel_path);
std::shared_ptr<KVStorageFactory> storage_factory(
new FileStorageFactory(kernel_file_path));
SetKVStorageFactory(storage_factory);
// Create Engine // Create Engine
std::shared_ptr<mace::MaceEngine> engine; std::shared_ptr<mace::MaceEngine> engine;
MaceStatus create_engine_status; MaceStatus create_engine_status;
...@@ -306,7 +309,7 @@ int Main(int argc, char **argv) { ...@@ -306,7 +309,7 @@ int Main(int argc, char **argv) {
model_data_file_ptr, model_data_file_ptr,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#else #else
create_engine_status = create_engine_status =
...@@ -314,7 +317,7 @@ int Main(int argc, char **argv) { ...@@ -314,7 +317,7 @@ int Main(int argc, char **argv) {
model_data_file_ptr, model_data_file_ptr,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#endif #endif
if (create_engine_status != MaceStatus::MACE_SUCCESS) { if (create_engine_status != MaceStatus::MACE_SUCCESS) {
......
...@@ -13,30 +13,12 @@ ...@@ -13,30 +13,12 @@
// limitations under the License. // limitations under the License.
#include "mace/core/allocator.h" #include "mace/core/allocator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_allocator.h"
#endif
namespace mace { namespace mace {
std::map<int32_t, Allocator *> *gAllocatorRegistry() { Allocator *GetCPUAllocator() {
static std::map<int32_t, Allocator *> g_allocator_registry; static CPUAllocator allocator;
return &g_allocator_registry; return &allocator;
} }
Allocator *GetDeviceAllocator(DeviceType type) {
auto iter = gAllocatorRegistry()->find(type);
if (iter == gAllocatorRegistry()->end()) {
LOG(ERROR) << "Allocator not found for device " << type;
return nullptr;
}
return iter->second;
}
MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator());
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_ALLOCATOR(DeviceType::GPU, new OpenCLAllocator());
#endif
MACE_REGISTER_ALLOCATOR(DeviceType::HEXAGON, new CPUAllocator());
} // namespace mace } // namespace mace
...@@ -26,8 +26,6 @@ ...@@ -26,8 +26,6 @@
#include "mace/core/registry.h" #include "mace/core/registry.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/core/runtime_failure_mock.h" #include "mace/core/runtime_failure_mock.h"
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
namespace mace { namespace mace {
...@@ -138,26 +136,8 @@ class CPUAllocator : public Allocator { ...@@ -138,26 +136,8 @@ class CPUAllocator : public Allocator {
bool OnHost() const override { return true; } bool OnHost() const override { return true; }
}; };
std::map<int32_t, Allocator *> *gAllocatorRegistry(); // Global CPU allocator used for CPU/GPU/DSP
Allocator *GetCPUAllocator();
Allocator *GetDeviceAllocator(DeviceType type);
struct AllocatorRegisterer {
explicit AllocatorRegisterer(DeviceType type, Allocator *alloc) {
if (gAllocatorRegistry()->count(type)) {
LOG(ERROR) << "Allocator for device type " << type
<< " registered twice. This should not happen."
<< gAllocatorRegistry()->count(type);
std::exit(1);
}
gAllocatorRegistry()->emplace(type, alloc);
}
};
#define MACE_REGISTER_ALLOCATOR(type, alloc) \
namespace { \
static AllocatorRegisterer MACE_ANONYMOUS_VARIABLE(Allocator)(type, alloc); \
}
} // namespace mace } // namespace mace
......
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
#include <vector> #include <vector>
#include "mace/proto/mace.pb.h" #include "mace/proto/mace.pb.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
......
...@@ -218,9 +218,9 @@ class Buffer : public BufferBase { ...@@ -218,9 +218,9 @@ class Buffer : public BufferBase {
class Image : public BufferBase { class Image : public BufferBase {
public: public:
Image() explicit Image(Allocator *allocator)
: BufferBase(0), : BufferBase(0),
allocator_(GetDeviceAllocator(GPU)), allocator_(allocator),
buf_(nullptr), buf_(nullptr),
mapped_buf_(nullptr) {} mapped_buf_(nullptr) {}
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/device.h"
namespace mace {
// Wires up the CPU runtime with the requested number of worker threads.
CPUDevice::CPUDevice(const int num_threads)
    : cpu_runtime_(new CPURuntime(num_threads)) {}

CPUDevice::~CPUDevice() = default;

CPURuntime *CPUDevice::cpu_runtime() {
  return cpu_runtime_.get();
}

#ifdef MACE_ENABLE_OPENCL
// The CPU device has no OpenCL runtime; callers must null-check.
OpenCLRuntime *CPUDevice::opencl_runtime() {
  return nullptr;
}
#endif

// CPU tensors are allocated from the process-wide CPU allocator.
Allocator *CPUDevice::allocator() {
  return GetCPUAllocator();
}

DeviceType CPUDevice::device_type() const {
  return DeviceType::CPU;
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_DEVICE_H_
#define MACE_CORE_DEVICE_H_
#include <memory>
#include "mace/core/runtime/cpu/cpu_runtime.h"
#include "mace/core/allocator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_runtime.h"
#endif
namespace mace {
// Abstract compute device (CPU / GPU / ...). A Device bundles the runtime
// objects and the memory allocator needed to run a net on that device.
class Device {
 public:
  // Virtual destructor: concrete devices are deleted through Device*.
  virtual ~Device() {}

#ifdef MACE_ENABLE_OPENCL
  // OpenCL runtime of this device; may be nullptr for devices that do
  // not use OpenCL (see CPUDevice::opencl_runtime).
  virtual OpenCLRuntime *opencl_runtime() = 0;
#endif
  // CPU runtime (thread configuration) of this device.
  virtual CPURuntime *cpu_runtime() = 0;
  // Allocator used to create tensors for this device.
  virtual Allocator *allocator() = 0;
  // Enum tag identifying the concrete device type.
  virtual DeviceType device_type() const = 0;
};
// Device implementation backed by the host CPU.
class CPUDevice : public Device {
 public:
  // num_threads: number of threads handed to the owned CPURuntime.
  explicit CPUDevice(const int num_threads);
  virtual ~CPUDevice();

#ifdef MACE_ENABLE_OPENCL
  // Always returns nullptr: the CPU device carries no OpenCL runtime.
  OpenCLRuntime *opencl_runtime() override;
#endif
  CPURuntime *cpu_runtime() override;
  // Returns the process-wide CPU allocator.
  Allocator *allocator() override;
  // Returns DeviceType::CPU.
  DeviceType device_type() const override;

 private:
  std::unique_ptr<CPURuntime> cpu_runtime_;
};
} // namespace mace
#endif // MACE_CORE_DEVICE_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/device_context.h"
#include <sys/stat.h>
namespace mace {
namespace {
const char *kPrecompiledProgramFileName = "mace_cl_compiled_program.bin";
// Returns the first entry in |paths| that exists and is a regular file,
// or an empty string when none qualifies.
//
// Fix: iterate by const reference — the previous `for (auto path : paths)`
// copied every std::string on each iteration.
std::string FindFirstExistPath(const std::vector<std::string> &paths) {
  std::string result;
  struct stat st;
  for (const auto &path : paths) {
    // stat() == 0 means the path exists; S_ISREG rejects directories,
    // sockets, etc.
    if (stat(path.c_str(), &st) == 0 && S_ISREG(st.st_mode)) {
      result = path;
      break;
    }
  }
  return result;
}
} // namespace
// Builds the GPU shared state:
//  - when |storage_path| is non-empty, opens a KV storage used to cache
//    OpenCL programs compiled at runtime;
//  - when one of |opencl_binary_paths| exists as a regular file, opens a
//    read-only storage over that precompiled binary;
//  - the tuner is always created from |opencl_parameter_path|.
GPUContext::GPUContext(const std::string &storage_path,
                       const std::vector<std::string> &opencl_binary_paths,
                       const std::string &opencl_parameter_path)
    : storage_factory_(new FileStorageFactory(storage_path)),
      opencl_tuner_(new Tuner<uint32_t>(opencl_parameter_path)) {
  if (!storage_path.empty()) {
    // Cache of programs compiled on this machine.
    opencl_cache_storage_ =
        storage_factory_->CreateStorage(kPrecompiledProgramFileName);
  }
  std::string precompiled_binary_path =
      FindFirstExistPath(opencl_binary_paths);
  if (!precompiled_binary_path.empty()) {
    // Precompiled binary shipped with the app, read directly from file.
    opencl_binary_storage_.reset(
        new FileStorage(precompiled_binary_path));
  }
}

GPUContext::~GPUContext() = default;

// May return nullptr when no precompiled binary path existed.
KVStorage *GPUContext::opencl_binary_storage() {
  return opencl_binary_storage_.get();
}

// May return nullptr when no storage path was configured.
KVStorage *GPUContext::opencl_cache_storage() {
  return opencl_cache_storage_.get();
}

Tuner<uint32_t> *GPUContext::opencl_tuner() {
  return opencl_tuner_.get();
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_DEVICE_CONTEXT_H_
#define MACE_CORE_DEVICE_CONTEXT_H_
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "mace/core/file_storage.h"
#include "mace/utils/tuner.h"
namespace mace {
// Holds GPU-wide shared state: key-value storages for compiled and
// precompiled OpenCL programs plus the OpenCL tuner. Intended to be shared
// across engines (see MaceEngineConfig::SetGPUContext usage).
class GPUContext {
 public:
  // storage_path: directory for the runtime compile cache; "" disables it.
  // opencl_binary_path: candidate precompiled binary files; the first one
  //     that exists as a regular file is used.
  // opencl_parameter_path: tuned OpenCL parameter file.
  // NOTE(review): callable with a single argument yet not `explicit`;
  // confirm whether implicit conversion from std::string is intended.
  GPUContext(const std::string &storage_path = "",
             const std::vector<std::string> &opencl_binary_path = {},
             const std::string &opencl_parameter_path = "");
  ~GPUContext();

  // Storage over the precompiled binary; nullptr when none was found.
  KVStorage *opencl_binary_storage();
  // Runtime compile cache; nullptr when no storage path was configured.
  KVStorage *opencl_cache_storage();
  Tuner<uint32_t> *opencl_tuner();

 private:
  std::unique_ptr<KVStorageFactory> storage_factory_;
  std::unique_ptr<Tuner<uint32_t>> opencl_tuner_;
  std::unique_ptr<KVStorage> opencl_binary_storage_;
  std::unique_ptr<KVStorage> opencl_cache_storage_;
};
} // namespace mace
#endif // MACE_CORE_DEVICE_CONTEXT_H_
...@@ -28,10 +28,36 @@ ...@@ -28,10 +28,36 @@
namespace mace { namespace mace {
std::shared_ptr<KVStorageFactory> kStorageFactory = nullptr; class FileStorageFactory::Impl {
public:
explicit Impl(const std::string &path);
std::unique_ptr<KVStorage> CreateStorage(const std::string &name);
private:
std::string path_;
};
// Remembers the base directory all storages will be created under.
FileStorageFactory::Impl::Impl(const std::string &path): path_(path) {}

// Creates a file-backed KV storage at "<path_>/<name>".
//
// Fix: the previous implementation returned
// std::move(std::unique_ptr<KVStorage>(...)) — std::move on a prvalue is
// redundant and inhibits copy elision; return the temporary directly.
std::unique_ptr<KVStorage> FileStorageFactory::Impl::CreateStorage(
    const std::string &name) {
  return std::unique_ptr<KVStorage>(
      new FileStorage(path_ + "/" + name));
}
// Forwards construction to the pimpl, which stores the base path.
FileStorageFactory::FileStorageFactory(const std::string &path):
    impl_(new FileStorageFactory::Impl(path)) {}

// Out-of-line so ~Impl is instantiated here, where Impl is complete.
FileStorageFactory::~FileStorageFactory() = default;

// Delegates to Impl: produces a FileStorage at "<path>/<name>".
std::unique_ptr<KVStorage> FileStorageFactory::CreateStorage(
    const std::string &name) {
  return impl_->CreateStorage(name);
}
FileStorage::FileStorage(const std::string &file_path): FileStorage::FileStorage(const std::string &file_path):
data_changed_(false), file_path_(file_path) {} loaded_(false), data_changed_(false), file_path_(file_path) {}
int FileStorage::Load() { int FileStorage::Load() {
struct stat st; struct stat st;
...@@ -47,6 +73,9 @@ int FileStorage::Load() { ...@@ -47,6 +73,9 @@ int FileStorage::Load() {
} }
} }
utils::WriteLock lock(&data_mutex_); utils::WriteLock lock(&data_mutex_);
if (loaded_) {
return 0;
}
int fd = open(file_path_.c_str(), O_RDONLY); int fd = open(file_path_.c_str(), O_RDONLY);
if (fd < 0) { if (fd < 0) {
if (errno == ENOENT) { if (errno == ENOENT) {
...@@ -118,13 +147,17 @@ int FileStorage::Load() { ...@@ -118,13 +147,17 @@ int FileStorage::Load() {
<< " failed, error code: " << strerror(errno); << " failed, error code: " << strerror(errno);
return -1; return -1;
} }
loaded_ = true;
return 0; return 0;
} }
void FileStorage::Clear() { bool FileStorage::Clear() {
utils::WriteLock lock(&data_mutex_); utils::WriteLock lock(&data_mutex_);
data_.clear(); if (!data_.empty()) {
data_changed_ = true; data_.clear();
data_changed_ = true;
}
return true;
} }
bool FileStorage::Insert(const std::string &key, bool FileStorage::Insert(const std::string &key,
......
...@@ -16,27 +16,64 @@ ...@@ -16,27 +16,64 @@
#define MACE_CORE_FILE_STORAGE_H_ #define MACE_CORE_FILE_STORAGE_H_
#include <map> #include <map>
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "mace/public/mace_runtime.h" #include "mace/public/mace.h"
#include "mace/utils/rwlock.h" #include "mace/utils/rwlock.h"
namespace mace { namespace mace {
// Abstract persistent key-value storage: string keys mapped to raw byte
// vectors.
class KVStorage {
 public:
  // Loads the backing data into memory.
  // return: 0 for success, -1 for error
  virtual int Load() = 0;
  // Removes all entries; returns true on success.
  virtual bool Clear() = 0;
  // insert or update the key-value.
  virtual bool Insert(const std::string &key,
                      const std::vector<unsigned char> &value) = 0;
  // Looks up |key|; presumably returns nullptr when the key is absent —
  // confirm against the concrete implementation.
  virtual const std::vector<unsigned char> *Find(const std::string &key) = 0;
  // Persists in-memory data back to the backing store.
  // return: 0 for success, -1 for error
  virtual int Flush() = 0;
  virtual ~KVStorage() {}
};
// Factory interface producing KVStorage instances identified by name.
class KVStorageFactory {
 public:
  // Creates (or opens) the storage associated with |name|.
  virtual std::unique_ptr<KVStorage> CreateStorage(const std::string &name) = 0;
  virtual ~KVStorageFactory() {}
};
// KVStorageFactory that materializes each named storage as a file inside a
// fixed directory.
class FileStorageFactory : public KVStorageFactory {
 public:
  // The application must have read and write permission for |path|.
  explicit FileStorageFactory(const std::string &path);
  ~FileStorageFactory();

  // Creates a file-backed storage located at "<path>/<name>".
  std::unique_ptr<KVStorage> CreateStorage(const std::string &name) override;

 private:
  class Impl;
  std::unique_ptr<Impl> impl_;
};
class FileStorage : public KVStorage { class FileStorage : public KVStorage {
public: public:
explicit FileStorage(const std::string &file_path); explicit FileStorage(const std::string &file_path);
public: public:
int Load() override; int Load() override;
void Clear() override; bool Clear() override;
bool Insert(const std::string &key, bool Insert(const std::string &key,
const std::vector<unsigned char> &value) override; const std::vector<unsigned char> &value) override;
const std::vector<unsigned char> *Find(const std::string &key) override; const std::vector<unsigned char> *Find(const std::string &key) override;
int Flush() override; int Flush() override;
private: private:
bool loaded_;
bool data_changed_; bool data_changed_;
std::string file_path_; std::string file_path_;
std::map<std::string, std::vector<unsigned char>> data_; std::map<std::string, std::vector<unsigned char>> data_;
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "mace/core/macros.h" #include "mace/core/macros.h"
#include "mace/core/net.h" #include "mace/core/net.h"
#include "mace/public/mace.h"
#include "mace/utils/memory_logging.h" #include "mace/utils/memory_logging.h"
#include "mace/utils/timer.h" #include "mace/utils/timer.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
...@@ -27,30 +28,35 @@ namespace mace { ...@@ -27,30 +28,35 @@ namespace mace {
NetBase::NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry, NetBase::NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def, const std::shared_ptr<const NetDef> net_def,
Workspace *ws, Workspace *ws,
DeviceType type) Device *device)
: name_(net_def->name()), op_registry_(op_registry) { : name_(net_def->name()), op_registry_(op_registry) {
MACE_UNUSED(ws); MACE_UNUSED(ws);
MACE_UNUSED(type); MACE_UNUSED(device);
} }
SerialNet::SerialNet( SerialNet::SerialNet(
const std::shared_ptr<const OperatorRegistryBase> op_registry, const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def, const std::shared_ptr<const NetDef> net_def,
Workspace *ws, Workspace *ws,
DeviceType type, Device *device,
const NetMode mode) const NetMode mode)
: NetBase(op_registry, net_def, ws, type), device_type_(type) { : NetBase(op_registry, net_def, ws, device), device_(device),
op_kernel_context_(new OpKernelContext(ws, device)) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name()); MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
DeviceType device_type = device->device_type();
for (int idx = 0; idx < net_def->op_size(); ++idx) { for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx); const auto &operator_def = net_def->op(idx);
// TODO(liuqi): refactor to add device_type to OperatorDef // TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device = const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
operator_def, "device", static_cast<int>(device_type_)); operator_def, "device", static_cast<int>(device_type));
if (op_device == type) { if (op_device == device_type) {
VLOG(3) << "Creating operator " << operator_def.name() << "("
<< operator_def.type() << ")";
OperatorDef temp_def(operator_def); OperatorDef temp_def(operator_def);
std::unique_ptr<OperatorBase> op( std::unique_ptr<OperatorBase> op(
op_registry->CreateOperator(temp_def, ws, type, mode)); op_registry->CreateOperator(temp_def, op_kernel_context_.get(),
device_type, mode));
if (op) { if (op) {
operators_.emplace_back(std::move(op)); operators_.emplace_back(std::move(op));
} }
...@@ -61,13 +67,14 @@ SerialNet::SerialNet( ...@@ -61,13 +67,14 @@ SerialNet::SerialNet(
MaceStatus SerialNet::Run(RunMetadata *run_metadata) { MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
MACE_MEMORY_LOGGING_GUARD(); MACE_MEMORY_LOGGING_GUARD();
MACE_LATENCY_LOGGER(1, "Running net"); MACE_LATENCY_LOGGER(1, "Running net");
const DeviceType device_type = device_->device_type();
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
auto &op = *iter; auto &op = *iter;
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(", MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
op->debug_def().type(), "), mem_id: ", op->debug_def().type(), "), mem_id: ",
MakeListString(op->debug_def().mem_id().data(), MakeListString(op->debug_def().mem_id().data(),
op->debug_def().mem_id().size())); op->debug_def().mem_id().size()));
bool future_wait = (device_type_ == DeviceType::GPU && bool future_wait = (device_type == DeviceType::GPU &&
(run_metadata != nullptr || (run_metadata != nullptr ||
std::distance(iter, operators_.end()) == 1)); std::distance(iter, operators_.end()) == 1));
...@@ -80,6 +87,9 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { ...@@ -80,6 +87,9 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
} else { } else {
future.wait_fn(nullptr); future.wait_fn(nullptr);
} }
#ifdef MACE_ENABLE_OPENCL
device_->opencl_runtime()->command_queue().finish();
#endif
} else if (run_metadata != nullptr) { } else if (run_metadata != nullptr) {
call_stats.start_micros = NowMicros(); call_stats.start_micros = NowMicros();
MACE_RETURN_IF_ERROR(op->Run(nullptr)); MACE_RETURN_IF_ERROR(op->Run(nullptr));
...@@ -125,7 +135,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { ...@@ -125,7 +135,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
VLOG(3) << "Operator " << op->debug_def().name() VLOG(3) << "Operator " << op->debug_def().name()
<< " has shape: " << MakeString(op->Output(0)->shape()); << " has shape: " << MakeString(op->Output(0)->shape());
if (EnvEnabled("MACE_LOG_TENSOR_RANGE") && device_type_ == CPU) { if (EnvEnabled("MACE_LOG_TENSOR_RANGE") && device_type == CPU) {
for (int i = 0; i < op->OutputSize(); ++i) { for (int i = 0; i < op->OutputSize(); ++i) {
int data_type = op->GetOptionalArg("T", static_cast<int>(DT_FLOAT)); int data_type = op->GetOptionalArg("T", static_cast<int>(DT_FLOAT));
if (data_type == static_cast<int>(DT_FLOAT)) { if (data_type == static_cast<int>(DT_FLOAT)) {
...@@ -151,20 +161,20 @@ std::unique_ptr<NetBase> CreateNet( ...@@ -151,20 +161,20 @@ std::unique_ptr<NetBase> CreateNet(
const std::shared_ptr<const OperatorRegistryBase> op_registry, const std::shared_ptr<const OperatorRegistryBase> op_registry,
const NetDef &net_def, const NetDef &net_def,
Workspace *ws, Workspace *ws,
DeviceType type, Device *device,
const NetMode mode) { const NetMode mode) {
std::shared_ptr<NetDef> tmp_net_def(new NetDef(net_def)); std::shared_ptr<NetDef> tmp_net_def(new NetDef(net_def));
return CreateNet(op_registry, tmp_net_def, ws, type, mode); return CreateNet(op_registry, tmp_net_def, ws, device, mode);
} }
std::unique_ptr<NetBase> CreateNet( std::unique_ptr<NetBase> CreateNet(
const std::shared_ptr<const OperatorRegistryBase> op_registry, const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def, const std::shared_ptr<const NetDef> net_def,
Workspace *ws, Workspace *ws,
DeviceType type, Device *device,
const NetMode mode) { const NetMode mode) {
std::unique_ptr<NetBase> net( std::unique_ptr<NetBase> net(
new SerialNet(op_registry, net_def, ws, type, mode)); new SerialNet(op_registry, net_def, ws, device, mode));
return net; return net;
} }
......
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
#include <vector> #include <vector>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
...@@ -33,7 +32,7 @@ class NetBase { ...@@ -33,7 +32,7 @@ class NetBase {
NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry, NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def, const std::shared_ptr<const NetDef> net_def,
Workspace *ws, Workspace *ws,
DeviceType type); Device *device);
virtual ~NetBase() noexcept {} virtual ~NetBase() noexcept {}
virtual MaceStatus Run(RunMetadata *run_metadata = nullptr) = 0; virtual MaceStatus Run(RunMetadata *run_metadata = nullptr) = 0;
...@@ -52,14 +51,15 @@ class SerialNet : public NetBase { ...@@ -52,14 +51,15 @@ class SerialNet : public NetBase {
SerialNet(const std::shared_ptr<const OperatorRegistryBase> op_registry, SerialNet(const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def, const std::shared_ptr<const NetDef> net_def,
Workspace *ws, Workspace *ws,
DeviceType type, Device *device,
const NetMode mode = NetMode::NORMAL); const NetMode mode = NetMode::NORMAL);
MaceStatus Run(RunMetadata *run_metadata = nullptr) override; MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
protected: protected:
std::vector<std::unique_ptr<OperatorBase> > operators_; std::vector<std::unique_ptr<OperatorBase> > operators_;
DeviceType device_type_; Device *device_;
std::unique_ptr<OpKernelContext> op_kernel_context_;
MACE_DISABLE_COPY_AND_ASSIGN(SerialNet); MACE_DISABLE_COPY_AND_ASSIGN(SerialNet);
}; };
...@@ -68,13 +68,13 @@ std::unique_ptr<NetBase> CreateNet( ...@@ -68,13 +68,13 @@ std::unique_ptr<NetBase> CreateNet(
const std::shared_ptr<const OperatorRegistryBase> op_registry, const std::shared_ptr<const OperatorRegistryBase> op_registry,
const NetDef &net_def, const NetDef &net_def,
Workspace *ws, Workspace *ws,
DeviceType type, Device *device,
const NetMode mode = NetMode::NORMAL); const NetMode mode = NetMode::NORMAL);
std::unique_ptr<NetBase> CreateNet( std::unique_ptr<NetBase> CreateNet(
const std::shared_ptr<const OperatorRegistryBase> op_registry, const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def, const std::shared_ptr<const NetDef> net_def,
Workspace *ws, Workspace *ws,
DeviceType type, Device *device,
const NetMode mode = NetMode::NORMAL); const NetMode mode = NetMode::NORMAL);
} // namespace mace } // namespace mace
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_kernel_context.h"
namespace mace {
// Captures the workspace and device an operator kernel runs against.
// Both pointers are borrowed, not owned.
OpKernelContext::OpKernelContext(Workspace *ws, Device *device)
    : device_(device), ws_(ws) {}

OpKernelContext::~OpKernelContext() = default;

// Device the kernel executes on (not owned).
Device* OpKernelContext::device() {
  return device_;
}

// Workspace holding the net's tensors (not owned).
Workspace* OpKernelContext::workspace() {
  return ws_;
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_OP_KERNEL_CONTEXT_H_
#define MACE_CORE_OP_KERNEL_CONTEXT_H_
#include "mace/core/device.h"
#include "mace/core/workspace.h"
namespace mace {
// Execution context handed to each operator: gives kernels access to the
// target Device and the tensor Workspace. Holds raw non-owning pointers;
// both objects must outlive the context.
class OpKernelContext {
 public:
  OpKernelContext(Workspace *ws, Device *device);
  ~OpKernelContext();
  Device *device();
  Workspace *workspace();

 private:
  Device *device_;
  Workspace *ws_;
};
} // namespace mace
#endif // MACE_CORE_OP_KERNEL_CONTEXT_H_
...@@ -18,12 +18,15 @@ ...@@ -18,12 +18,15 @@
#include <vector> #include <vector>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/core/op_kernel_context.h"
namespace mace { namespace mace {
OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws) OperatorBase::OperatorBase(const OperatorDef &operator_def,
: operator_ws_(ws), OpKernelContext *context)
operator_def_(std::make_shared<OperatorDef>(operator_def)) {} : operator_def_(std::make_shared<OperatorDef>(operator_def)) {
MACE_UNUSED(context);
}
OpKeyBuilder::OpKeyBuilder(const char *op_name) : op_name_(op_name) {} OpKeyBuilder::OpKeyBuilder(const char *op_name) : op_name_(op_name) {}
...@@ -54,7 +57,7 @@ OperatorRegistryBase::~OperatorRegistryBase() {} ...@@ -54,7 +57,7 @@ OperatorRegistryBase::~OperatorRegistryBase() {}
std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator( std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator(
const OperatorDef &operator_def, const OperatorDef &operator_def,
Workspace *ws, OpKernelContext *context,
DeviceType type, DeviceType type,
const NetMode mode) const { const NetMode mode) const {
const int dtype = ProtoArgHelper::GetOptionalArg<OperatorDef, int>( const int dtype = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
...@@ -70,7 +73,7 @@ std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator( ...@@ -70,7 +73,7 @@ std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator(
.Device(type) .Device(type)
.TypeConstraint("T", static_cast<DataType>(dtype)) .TypeConstraint("T", static_cast<DataType>(dtype))
.Build(), .Build(),
operator_def, ws); operator_def, context);
} else { } else {
return nullptr; return nullptr;
} }
......
...@@ -22,17 +22,17 @@ ...@@ -22,17 +22,17 @@
#include "mace/core/arg_helper.h" #include "mace/core/arg_helper.h"
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/op_kernel_context.h"
#include "mace/core/registry.h" #include "mace/core/registry.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/workspace.h" #include "mace/core/workspace.h"
#include "mace/proto/mace.pb.h" #include "mace/proto/mace.pb.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
class OperatorBase { class OperatorBase {
public: public:
explicit OperatorBase(const OperatorDef &operator_def, Workspace *ws); explicit OperatorBase(const OperatorDef &operator_def, OpKernelContext *);
virtual ~OperatorBase() noexcept {} virtual ~OperatorBase() noexcept {}
template <typename T> template <typename T>
...@@ -78,7 +78,6 @@ class OperatorBase { ...@@ -78,7 +78,6 @@ class OperatorBase {
inline bool has_debug_def() const { return operator_def_ != nullptr; } inline bool has_debug_def() const { return operator_def_ != nullptr; }
protected: protected:
Workspace *operator_ws_;
std::shared_ptr<const OperatorDef> operator_def_; std::shared_ptr<const OperatorDef> operator_def_;
std::vector<const Tensor *> inputs_; std::vector<const Tensor *> inputs_;
std::vector<Tensor *> outputs_; std::vector<Tensor *> outputs_;
...@@ -89,8 +88,9 @@ class OperatorBase { ...@@ -89,8 +88,9 @@ class OperatorBase {
template <DeviceType D, class T> template <DeviceType D, class T>
class Operator : public OperatorBase { class Operator : public OperatorBase {
public: public:
explicit Operator(const OperatorDef &operator_def, Workspace *ws) explicit Operator(const OperatorDef &operator_def, OpKernelContext *context)
: OperatorBase(operator_def, ws) { : OperatorBase(operator_def, context) {
Workspace *ws = context->workspace();
for (const std::string &input_str : operator_def.input()) { for (const std::string &input_str : operator_def.input()) {
const Tensor *tensor = ws->GetTensor(input_str); const Tensor *tensor = ws->GetTensor(input_str);
MACE_CHECK(tensor != nullptr, "op ", operator_def.type(), MACE_CHECK(tensor != nullptr, "op ", operator_def.type(),
...@@ -116,7 +116,7 @@ class Operator : public OperatorBase { ...@@ -116,7 +116,7 @@ class Operator : public OperatorBase {
output_type = DataTypeToEnum<T>::v(); output_type = DataTypeToEnum<T>::v();
} }
outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor( outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
output_str, GetDeviceAllocator(D), output_type))); output_str, context->device()->allocator(), output_type)));
} }
} }
} }
...@@ -165,13 +165,16 @@ OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) { ...@@ -165,13 +165,16 @@ OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {
class OperatorRegistryBase { class OperatorRegistryBase {
public: public:
typedef Registry<std::string, OperatorBase, const OperatorDef &, Workspace *> typedef Registry<std::string,
OperatorBase,
const OperatorDef &,
OpKernelContext *>
RegistryType; RegistryType;
OperatorRegistryBase() = default; OperatorRegistryBase() = default;
virtual ~OperatorRegistryBase(); virtual ~OperatorRegistryBase();
RegistryType *registry() { return &registry_; } RegistryType *registry() { return &registry_; }
std::unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def, std::unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def,
Workspace *ws, OpKernelContext *context,
DeviceType type, DeviceType type,
const NetMode mode) const; const NetMode mode) const;
...@@ -183,7 +186,7 @@ class OperatorRegistryBase { ...@@ -183,7 +186,7 @@ class OperatorRegistryBase {
MACE_DECLARE_REGISTRY(OpRegistry, MACE_DECLARE_REGISTRY(OpRegistry,
OperatorBase, OperatorBase,
const OperatorDef &, const OperatorDef &,
Workspace *); OpKernelContext *);
#define MACE_REGISTER_OPERATOR(op_registry, name, ...) \ #define MACE_REGISTER_OPERATOR(op_registry, name, ...) \
MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__) MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__)
......
...@@ -22,7 +22,6 @@ ...@@ -22,7 +22,6 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "mace/public/mace.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
namespace mace { namespace mace {
......
...@@ -30,7 +30,6 @@ ...@@ -30,7 +30,6 @@
#include "public/gemmlowp.h" #include "public/gemmlowp.h"
#include "mace/core/macros.h" #include "mace/core/macros.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
namespace mace { namespace mace {
......
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
#include <vector> #include <vector>
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
namespace mace { namespace mace {
...@@ -34,6 +33,16 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint, ...@@ -34,6 +33,16 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
CPUAffinityPolicy policy, CPUAffinityPolicy policy,
bool use_gemmlowp = false); bool use_gemmlowp = false);
// Per-engine CPU runtime state.  Currently it only records the number of
// OpenMP worker threads the engine was configured with (replacing the old
// global thread-count setting).
class CPURuntime {
 public:
  explicit CPURuntime(const int num_threads) : num_threads_(num_threads) {}
  ~CPURuntime() = default;

  // Thread count supplied at construction; -1 appears to be used elsewhere
  // in this change as "library default" — confirm against CPUDevice usage.
  inline int num_threads() const { return num_threads_; }

 private:
  int num_threads_;
};
} // namespace mace } // namespace mace
#endif // MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_ #endif // MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/gpu_device.h"
namespace mace {
// Constructs a GPU device on top of the CPUDevice base (which receives the
// configured thread count).  All pointer arguments are borrowed, not owned:
// they are forwarded into the OpenCLRuntime, which stores them as raw
// pointers, so the caller must keep them alive for this device's lifetime.
// The allocator is built against the freshly created runtime so all of its
// buffers/images live in that runtime's OpenCL context.
GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
                     KVStorage *opencl_cache_storage,
                     const GPUPriorityHint priority,
                     const GPUPerfHint perf,
                     KVStorage *opencl_binary_storage,
                     const int num_threads) :
    CPUDevice(num_threads),
    runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf,
                               opencl_binary_storage, tuner)),
    allocator_(new OpenCLAllocator(runtime_.get())) {}
// Members are destroyed in reverse declaration order, so allocator_ (which
// holds a raw pointer into runtime_) is released before runtime_ itself.
GPUDevice::~GPUDevice() = default;

// Returns the device's OpenCL runtime.  Non-owning pointer; valid for the
// lifetime of this GPUDevice.
OpenCLRuntime* GPUDevice::opencl_runtime() {
  return runtime_.get();
}

// Returns the OpenCL allocator used for this device's tensors.  Non-owning
// pointer; valid for the lifetime of this GPUDevice.
Allocator* GPUDevice::allocator() {
  return allocator_.get();
}

// This object always represents a GPU, even though it inherits CPUDevice.
DeviceType GPUDevice::device_type() const {
  return DeviceType::GPU;
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_
#define MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_
#include <memory>
#include "mace/core/device_context.h"
#include "mace/core/device.h"
#include "mace/core/runtime/opencl/opencl_allocator.h"
namespace mace {
// Device backed by an OpenCL-capable GPU.  It derives from CPUDevice —
// presumably so CPU-side state (thread count) is still available through
// the same object; confirm against mace/core/device.h.
class GPUDevice : public CPUDevice {
 public:
  // All pointer parameters are borrowed and forwarded to the underlying
  // OpenCLRuntime; they must outlive this device.  num_threads = -1 keeps
  // the CPUDevice default behavior.
  GPUDevice(Tuner<uint32_t> *tuner,
            KVStorage *opencl_cache_storage = nullptr,
            const GPUPriorityHint priority = GPUPriorityHint::PRIORITY_LOW,
            const GPUPerfHint perf = GPUPerfHint::PERF_NORMAL,
            KVStorage *opencl_binary_storage = nullptr,
            const int num_threads = -1);
  ~GPUDevice();
  // Overrides return non-owning pointers into the members below.
  OpenCLRuntime *opencl_runtime() override;
  Allocator *allocator() override;
  DeviceType device_type() const override;
 private:
  // Declaration order matters: allocator_ references runtime_ and is
  // therefore declared (and destroyed) after it.
  std::unique_ptr<OpenCLRuntime> runtime_;
  std::unique_ptr<OpenCLAllocator> allocator_;
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_
...@@ -12,8 +12,9 @@ ...@@ -12,8 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <memory>
#include "mace/core/runtime/opencl/opencl_allocator.h" #include "mace/core/runtime/opencl/opencl_allocator.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace { namespace mace {
...@@ -37,7 +38,9 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) { ...@@ -37,7 +38,9 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
} }
} // namespace } // namespace
OpenCLAllocator::OpenCLAllocator() {} OpenCLAllocator::OpenCLAllocator(
OpenCLRuntime *opencl_runtime):
opencl_runtime_(opencl_runtime) {}
OpenCLAllocator::~OpenCLAllocator() {} OpenCLAllocator::~OpenCLAllocator() {}
MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const { MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
...@@ -51,7 +54,7 @@ MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const { ...@@ -51,7 +54,7 @@ MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
} }
cl_int error; cl_int error;
cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(), cl::Buffer *buffer = new cl::Buffer(opencl_runtime_->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
nbytes, nullptr, &error); nbytes, nullptr, &error);
if (error != CL_SUCCESS) { if (error != CL_SUCCESS) {
...@@ -82,7 +85,7 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape, ...@@ -82,7 +85,7 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
cl_int error; cl_int error;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(OpenCLRuntime::Global()->context(), new cl::Image2D(opencl_runtime_->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format,
image_shape[0], image_shape[1], 0, nullptr, &error); image_shape[0], image_shape[1], 0, nullptr, &error);
if (error != CL_SUCCESS) { if (error != CL_SUCCESS) {
...@@ -116,8 +119,9 @@ void OpenCLAllocator::DeleteImage(void *buffer) const { ...@@ -116,8 +119,9 @@ void OpenCLAllocator::DeleteImage(void *buffer) const {
} }
void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const { void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
VLOG(3) << "Map OpenCL buffer";
auto cl_buffer = static_cast<cl::Buffer *>(buffer); auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Global()->command_queue(); auto queue = opencl_runtime_->command_queue();
// TODO(heliangliang) Non-blocking call // TODO(heliangliang) Non-blocking call
cl_int error; cl_int error;
void *mapped_ptr = void *mapped_ptr =
...@@ -134,14 +138,15 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const { ...@@ -134,14 +138,15 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
void *OpenCLAllocator::MapImage(void *buffer, void *OpenCLAllocator::MapImage(void *buffer,
const std::vector<size_t> &image_shape, const std::vector<size_t> &image_shape,
std::vector<size_t> *mapped_image_pitch) const { std::vector<size_t> *mapped_image_pitch) const {
MACE_CHECK(image_shape.size() == 2, "Just support map 2d image"); VLOG(3) << "Map OpenCL Image";
MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
auto cl_image = static_cast<cl::Image2D *>(buffer); auto cl_image = static_cast<cl::Image2D *>(buffer);
std::array<size_t, 3> origin = {0, 0, 0}; std::array<size_t, 3> origin = {0, 0, 0};
std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1}; std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1};
mapped_image_pitch->resize(2); mapped_image_pitch->resize(2);
cl_int error; cl_int error;
void *mapped_ptr = OpenCLRuntime::Global()->command_queue().enqueueMapImage( void *mapped_ptr = opencl_runtime_->command_queue().enqueueMapImage(
*cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region, *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr, mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
nullptr, &error); nullptr, &error);
...@@ -153,8 +158,9 @@ void *OpenCLAllocator::MapImage(void *buffer, ...@@ -153,8 +158,9 @@ void *OpenCLAllocator::MapImage(void *buffer,
} }
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const { void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
VLOG(3) << "Unmap OpenCL buffer/Image";
auto cl_buffer = static_cast<cl::Buffer *>(buffer); auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Global()->command_queue(); auto queue = opencl_runtime_->command_queue();
cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr, cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,
nullptr, nullptr); nullptr, nullptr);
if (error != CL_SUCCESS) { if (error != CL_SUCCESS) {
......
...@@ -15,15 +15,17 @@ ...@@ -15,15 +15,17 @@
#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_ #ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_
#define MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_ #define MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_
#include <memory>
#include <vector> #include <vector>
#include "mace/core/allocator.h" #include "mace/core/allocator.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace { namespace mace {
class OpenCLAllocator : public Allocator { class OpenCLAllocator : public Allocator {
public: public:
OpenCLAllocator(); explicit OpenCLAllocator(OpenCLRuntime *opencl_runtime);
~OpenCLAllocator() override; ~OpenCLAllocator() override;
...@@ -51,6 +53,9 @@ class OpenCLAllocator : public Allocator { ...@@ -51,6 +53,9 @@ class OpenCLAllocator : public Allocator {
void Unmap(void *buffer, void *mapped_ptr) const override; void Unmap(void *buffer, void *mapped_ptr) const override;
bool OnHost() const override; bool OnHost() const override;
private:
OpenCLRuntime *opencl_runtime_;
}; };
} // namespace mace } // namespace mace
......
...@@ -24,11 +24,9 @@ ...@@ -24,11 +24,9 @@
#include <vector> #include <vector>
#include <utility> #include <utility>
#include "mace/public/mace_runtime.h"
#include "mace/core/macros.h" #include "mace/core/macros.h"
#include "mace/core/file_storage.h" #include "mace/core/file_storage.h"
#include "mace/core/runtime/opencl/opencl_extension.h" #include "mace/core/runtime/opencl/opencl_extension.h"
#include "mace/public/mace.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
namespace mace { namespace mace {
...@@ -249,14 +247,12 @@ std::string FindFirstExistPath(const std::vector<std::string> &paths) { ...@@ -249,14 +247,12 @@ std::string FindFirstExistPath(const std::vector<std::string> &paths) {
const char *kOpenCLPlatformInfoKey = const char *kOpenCLPlatformInfoKey =
"mace_opencl_precompiled_platform_info_key"; "mace_opencl_precompiled_platform_info_key";
const char *kPrecompiledProgramFileName =
"mace_cl_compiled_program.bin";
} // namespace } // namespace
void OpenCLProfilingTimer::StartTiming() {} void OpenCLProfilingTimer::StartTiming() {}
void OpenCLProfilingTimer::StopTiming() { void OpenCLProfilingTimer::StopTiming() {
OpenCLRuntime::Global()->command_queue().finish(); runtime_->command_queue().finish();
start_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_START>(); start_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
stop_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_END>(); stop_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
} }
...@@ -278,35 +274,15 @@ void OpenCLProfilingTimer::ClearTiming() { ...@@ -278,35 +274,15 @@ void OpenCLProfilingTimer::ClearTiming() {
accumulated_micros_ = 0; accumulated_micros_ = 0;
} }
GPUPerfHint OpenCLRuntime::kGPUPerfHint = GPUPerfHint::PERF_NORMAL; OpenCLRuntime::OpenCLRuntime(
GPUPriorityHint OpenCLRuntime::kGPUPriorityHint = KVStorage *cache_storage,
GPUPriorityHint::PRIORITY_DEFAULT; const GPUPriorityHint priority_hint,
std::string const GPUPerfHint perf_hint,
OpenCLRuntime::kPrecompiledBinaryPath = ""; // NOLINT(runtime/string) KVStorage *precompiled_binary_storage,
Tuner<uint32_t> *tuner):
OpenCLRuntime *OpenCLRuntime::Global() { cache_storage_(cache_storage),
static OpenCLRuntime runtime; precompiled_binary_storage_(precompiled_binary_storage),
return &runtime; tuner_(tuner),
}
void OpenCLRuntime::Configure(GPUPerfHint gpu_perf_hint,
GPUPriorityHint gpu_priority_hint) {
OpenCLRuntime::kGPUPerfHint = gpu_perf_hint;
OpenCLRuntime::kGPUPriorityHint = gpu_priority_hint;
}
void OpenCLRuntime::ConfigureOpenCLBinaryPath(
const std::vector<std::string> &paths) {
OpenCLRuntime::kPrecompiledBinaryPath = FindFirstExistPath(paths);
if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
LOG(WARNING) << "There is no precompiled OpenCL binary file in "
<< MakeString(paths);
}
}
OpenCLRuntime::OpenCLRuntime():
precompiled_binary_storage_(nullptr),
cache_storage_(nullptr),
is_opencl_avaliable_(false), is_opencl_avaliable_(false),
is_profiling_enabled_(false), is_profiling_enabled_(false),
opencl_version_(CL_VER_UNKNOWN), opencl_version_(CL_VER_UNKNOWN),
...@@ -362,7 +338,7 @@ OpenCLRuntime::OpenCLRuntime(): ...@@ -362,7 +338,7 @@ OpenCLRuntime::OpenCLRuntime():
cl_command_queue_properties properties = 0; cl_command_queue_properties properties = 0;
const char *profiling = getenv("MACE_OPENCL_PROFILING"); const char *profiling = getenv("MACE_OPENCL_PROFILING");
if (Tuner<uint32_t>::Get()->IsTuning() || if (IsTuning() ||
(profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) { (profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) {
properties |= CL_QUEUE_PROFILING_ENABLE; properties |= CL_QUEUE_PROFILING_ENABLE;
is_profiling_enabled_ = true; is_profiling_enabled_ = true;
...@@ -374,8 +350,8 @@ OpenCLRuntime::OpenCLRuntime(): ...@@ -374,8 +350,8 @@ OpenCLRuntime::OpenCLRuntime():
std::vector<cl_context_properties> context_properties; std::vector<cl_context_properties> context_properties;
context_properties.reserve(5); context_properties.reserve(5);
GetAdrenoContextProperties(&context_properties, GetAdrenoContextProperties(&context_properties,
OpenCLRuntime::kGPUPerfHint, perf_hint,
OpenCLRuntime::kGPUPriorityHint); priority_hint);
context_ = std::shared_ptr<cl::Context>( context_ = std::shared_ptr<cl::Context>(
new cl::Context({*device_}, context_properties.data(), new cl::Context({*device_}, context_properties.data(),
nullptr, nullptr, &err)); nullptr, nullptr, &err));
...@@ -408,12 +384,8 @@ OpenCLRuntime::OpenCLRuntime(): ...@@ -408,12 +384,8 @@ OpenCLRuntime::OpenCLRuntime():
return; return;
} }
extern std::shared_ptr<KVStorageFactory> kStorageFactory;
std::string cached_binary_platform_info; std::string cached_binary_platform_info;
if (kStorageFactory != nullptr) { if (cache_storage_ != nullptr) {
cache_storage_ =
kStorageFactory->CreateStorage(kPrecompiledProgramFileName);
if (cache_storage_->Load() != 0) { if (cache_storage_->Load() != 0) {
LOG(WARNING) << "Load OpenCL cached compiled kernel file failed. " LOG(WARNING) << "Load OpenCL cached compiled kernel file failed. "
<< "Please make sure the storage directory exist " << "Please make sure the storage directory exist "
...@@ -432,9 +404,10 @@ OpenCLRuntime::OpenCLRuntime(): ...@@ -432,9 +404,10 @@ OpenCLRuntime::OpenCLRuntime():
} }
if (cached_binary_platform_info != platform_info_) { if (cached_binary_platform_info != platform_info_) {
if (!OpenCLRuntime::kPrecompiledBinaryPath.empty()) { if (precompiled_binary_storage_ == nullptr) {
precompiled_binary_storage_.reset( VLOG(1) << "There is no precompiled OpenCL binary in"
new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath)); " all OpenCL binary paths.";
} else {
if (precompiled_binary_storage_->Load() != 0) { if (precompiled_binary_storage_->Load() != 0) {
LOG(WARNING) << "Load OpenCL precompiled kernel file failed. " LOG(WARNING) << "Load OpenCL precompiled kernel file failed. "
<< "Please make sure the storage directory exist " << "Please make sure the storage directory exist "
...@@ -487,6 +460,8 @@ cl::Device &OpenCLRuntime::device() { return *device_; } ...@@ -487,6 +460,8 @@ cl::Device &OpenCLRuntime::device() { return *device_; }
cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; } cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; }
Tuner<uint32_t> *OpenCLRuntime::tuner() { return tuner_; }
uint64_t OpenCLRuntime::device_global_mem_cache_size() const { uint64_t OpenCLRuntime::device_global_mem_cache_size() const {
return device_gloabl_mem_cache_size_; return device_gloabl_mem_cache_size_;
} }
......
...@@ -22,11 +22,12 @@ ...@@ -22,11 +22,12 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "mace/core/file_storage.h"
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/string_util.h" #include "mace/utils/string_util.h"
#include "mace/utils/timer.h" #include "mace/utils/timer.h"
#include "mace/utils/tuner.h"
namespace mace { namespace mace {
...@@ -60,29 +61,17 @@ const std::string OpenCLErrorToString(cl_int error); ...@@ -60,29 +61,17 @@ const std::string OpenCLErrorToString(cl_int error);
return MaceStatus::MACE_OUT_OF_RESOURCES; \ return MaceStatus::MACE_OUT_OF_RESOURCES; \
} }
class OpenCLProfilingTimer : public Timer {
public:
explicit OpenCLProfilingTimer(const cl::Event *event)
: event_(event), accumulated_micros_(0) {}
void StartTiming() override;
void StopTiming() override;
void AccumulateTiming() override;
void ClearTiming() override;
double ElapsedMicros() override;
double AccumulatedMicros() override;
private:
const cl::Event *event_;
double start_nanos_;
double stop_nanos_;
double accumulated_micros_;
};
class OpenCLRuntime { class OpenCLRuntime {
public: public:
static OpenCLRuntime *Global(); OpenCLRuntime(
static void Configure(GPUPerfHint, GPUPriorityHint); KVStorage *cache_storage = nullptr,
static void ConfigureOpenCLBinaryPath(const std::vector<std::string> &paths); const GPUPriorityHint priority_hint = GPUPriorityHint::PRIORITY_NORMAL,
const GPUPerfHint perf_hint = GPUPerfHint::PERF_NORMAL,
KVStorage *precompiled_binary_storage = nullptr,
Tuner<uint32_t> *tuner = nullptr);
~OpenCLRuntime();
OpenCLRuntime(const OpenCLRuntime &) = delete;
OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
cl::Context &context(); cl::Context &context();
cl::Device &device(); cl::Device &device();
...@@ -91,6 +80,7 @@ class OpenCLRuntime { ...@@ -91,6 +80,7 @@ class OpenCLRuntime {
const std::string platform_info() const; const std::string platform_info() const;
uint64_t device_global_mem_cache_size() const; uint64_t device_global_mem_cache_size() const;
uint32_t device_compute_units() const; uint32_t device_compute_units() const;
Tuner<uint32_t> *tuner();
bool is_opencl_avaliable(); bool is_opencl_avaliable();
void GetCallStats(const cl::Event &event, CallStats *stats); void GetCallStats(const cl::Event &event, CallStats *stats);
...@@ -112,11 +102,6 @@ class OpenCLRuntime { ...@@ -112,11 +102,6 @@ class OpenCLRuntime {
void SaveBuiltCLProgram(); void SaveBuiltCLProgram();
private: private:
OpenCLRuntime();
~OpenCLRuntime();
OpenCLRuntime(const OpenCLRuntime &) = delete;
OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
bool BuildProgram(const std::string &program_file_name, bool BuildProgram(const std::string &program_file_name,
const std::string &binary_file_name, const std::string &binary_file_name,
const std::string &build_options, const std::string &build_options,
...@@ -137,10 +122,13 @@ class OpenCLRuntime { ...@@ -137,10 +122,13 @@ class OpenCLRuntime {
OpenCLVersion ParseDeviceVersion(const std::string &device_version); OpenCLVersion ParseDeviceVersion(const std::string &device_version);
private: private:
std::unique_ptr<KVStorage> precompiled_binary_storage_; KVStorage *cache_storage_;
std::unique_ptr<KVStorage> cache_storage_; KVStorage *precompiled_binary_storage_;
Tuner<uint32_t> *tuner_;
bool is_opencl_avaliable_; bool is_opencl_avaliable_;
bool is_profiling_enabled_; bool is_profiling_enabled_;
OpenCLVersion opencl_version_;
GPUType gpu_type_;
// All OpenCL object must be a pointer and manually deleted before unloading // All OpenCL object must be a pointer and manually deleted before unloading
// OpenCL library. // OpenCL library.
std::shared_ptr<cl::Context> context_; std::shared_ptr<cl::Context> context_;
...@@ -149,18 +137,30 @@ class OpenCLRuntime { ...@@ -149,18 +137,30 @@ class OpenCLRuntime {
std::map<std::string, cl::Program> built_program_map_; std::map<std::string, cl::Program> built_program_map_;
std::mutex program_build_mutex_; std::mutex program_build_mutex_;
std::string platform_info_; std::string platform_info_;
OpenCLVersion opencl_version_;
std::string precompiled_binary_platform_info_; std::string precompiled_binary_platform_info_;
bool out_of_range_check_; bool out_of_range_check_;
uint64_t device_gloabl_mem_cache_size_; uint64_t device_gloabl_mem_cache_size_;
uint32_t device_compute_units_; uint32_t device_compute_units_;
GPUType gpu_type_;
static GPUPerfHint kGPUPerfHint;
static GPUPriorityHint kGPUPriorityHint;
static std::string kPrecompiledBinaryPath;
}; };
class OpenCLProfilingTimer : public Timer {
public:
OpenCLProfilingTimer(OpenCLRuntime *runtime, const cl::Event *event)
: runtime_(runtime), event_(event), accumulated_micros_(0) {}
void StartTiming() override;
void StopTiming() override;
void AccumulateTiming() override;
void ClearTiming() override;
double ElapsedMicros() override;
double AccumulatedMicros() override;
private:
OpenCLRuntime *runtime_;
const cl::Event *event_;
double start_nanos_;
double stop_nanos_;
double accumulated_micros_;
};
} // namespace mace } // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_RUNTIME_H_ #endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_RUNTIME_H_
...@@ -25,7 +25,6 @@ ...@@ -25,7 +25,6 @@
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#endif #endif
#include "mace/public/mace.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#ifdef MACE_ENABLE_NEON #ifdef MACE_ENABLE_NEON
...@@ -38,10 +37,10 @@ ...@@ -38,10 +37,10 @@
namespace mace { namespace mace {
#define MACE_SINGLE_ARG(...) __VA_ARGS__ #define MACE_SINGLE_ARG(...) __VA_ARGS__
#define MACE_CASE(TYPE, STATEMENTS) \ #define MACE_CASE(TYPE, STATEMENTS) \
case DataTypeToEnum<TYPE>::value: { \ case DataTypeToEnum<TYPE>::value: { \
typedef TYPE T; \ typedef TYPE T; \
STATEMENTS; \ STATEMENTS; \
break; \ break; \
} }
...@@ -137,7 +136,7 @@ class Tensor { ...@@ -137,7 +136,7 @@ class Tensor {
buffer_ = &buffer_slice_; buffer_ = &buffer_slice_;
} }
Tensor() : Tensor(GetDeviceAllocator(CPU), DT_FLOAT) {} Tensor() : Tensor(GetCPUAllocator(), DT_FLOAT) {}
~Tensor() { ~Tensor() {
if (is_buffer_owner_ && buffer_ != nullptr) { if (is_buffer_owner_ && buffer_ != nullptr) {
...@@ -270,7 +269,7 @@ class Tensor { ...@@ -270,7 +269,7 @@ class Tensor {
image_shape_ = image_shape; image_shape_ = image_shape;
if (buffer_ == nullptr) { if (buffer_ == nullptr) {
MACE_CHECK(is_buffer_owner_); MACE_CHECK(is_buffer_owner_);
buffer_ = new Image(); buffer_ = new Image(allocator_);
return buffer_->Allocate(image_shape, dtype_); return buffer_->Allocate(image_shape, dtype_);
} else { } else {
MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize."); MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
......
...@@ -16,15 +16,10 @@ ...@@ -16,15 +16,10 @@
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "mace/core/runtime/cpu/cpu_runtime.h" #include "mace/core/runtime/cpu/cpu_runtime.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/testing/test_benchmark.h" #include "mace/core/testing/test_benchmark.h"
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*"); DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*");
DEFINE_int32(gpu_perf_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
DEFINE_int32(omp_num_threads, -1, "num of openmp threads"); DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
DEFINE_int32(cpu_affinity_policy, 1, DEFINE_int32(cpu_affinity_policy, 1,
"0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY"); "0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");
...@@ -43,10 +38,6 @@ int main(int argc, char **argv) { ...@@ -43,10 +38,6 @@ int main(int argc, char **argv) {
LOG(WARNING) << "Set openmp or cpu affinity failed."; LOG(WARNING) << "Set openmp or cpu affinity failed.";
} }
mace::OpenCLRuntime::Configure(
static_cast<mace::GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<mace::GPUPriorityHint>(FLAGS_gpu_priority_hint));
mace::testing::Benchmark::Run(FLAGS_filter.c_str()); mace::testing::Benchmark::Run(FLAGS_filter.c_str());
return 0; return 0;
} }
...@@ -12,6 +12,9 @@ ...@@ -12,6 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/core/workspace.h"
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include <unordered_set> #include <unordered_set>
...@@ -21,8 +24,6 @@ ...@@ -21,8 +24,6 @@
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#endif #endif
#include "mace/core/workspace.h"
#include "mace/utils/timer.h"
namespace mace { namespace mace {
...@@ -35,8 +36,8 @@ bool ShouldPreallocateMemoryForOp(const OperatorDef &op) { ...@@ -35,8 +36,8 @@ bool ShouldPreallocateMemoryForOp(const OperatorDef &op) {
} }
} // namespace } // namespace
Workspace::Workspace() : host_scratch_buffer_(new ScratchBuffer( Workspace::Workspace() :
GetDeviceAllocator(DeviceType::CPU))) {} host_scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {}
Tensor *Workspace::CreateTensor(const std::string &name, Tensor *Workspace::CreateTensor(const std::string &name,
Allocator *alloc, Allocator *alloc,
...@@ -74,7 +75,7 @@ std::vector<std::string> Workspace::Tensors() const { ...@@ -74,7 +75,7 @@ std::vector<std::string> Workspace::Tensors() const {
} }
MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
DeviceType type, Device *device,
const unsigned char *model_data) { const unsigned char *model_data) {
MACE_LATENCY_LOGGER(1, "Load model tensors"); MACE_LATENCY_LOGGER(1, "Load model tensors");
index_t model_data_size = 0; index_t model_data_size = 0;
...@@ -87,10 +88,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, ...@@ -87,10 +88,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
} }
VLOG(3) << "Model data size: " << model_data_size; VLOG(3) << "Model data size: " << model_data_size;
const DeviceType device_type = device->device_type();
if (model_data_size > 0) { if (model_data_size > 0) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
if (type == DeviceType::GPU && if (device_type == DeviceType::GPU &&
OpenCLRuntime::Global()->GetDeviceMaxMemAllocSize() <= device->opencl_runtime()->GetDeviceMaxMemAllocSize() <=
static_cast<uint64_t>(model_data_size)) { static_cast<uint64_t>(model_data_size)) {
for (auto &const_tensor : net_def.tensors()) { for (auto &const_tensor : net_def.tensors()) {
MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name()); MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
...@@ -104,7 +107,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, ...@@ -104,7 +107,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
} }
std::unique_ptr<Tensor> tensor( std::unique_ptr<Tensor> tensor(
new Tensor(GetDeviceAllocator(type), new Tensor(device->allocator(),
const_tensor.data_type(), true)); const_tensor.data_type(), true));
tensor->Resize(dims); tensor->Resize(dims);
...@@ -129,14 +132,14 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, ...@@ -129,14 +132,14 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
#else #else
{ {
#endif #endif
if (type == DeviceType::CPU) { if (device_type == DeviceType::CPU) {
tensor_buffer_ = std::unique_ptr<Buffer>( tensor_buffer_ = std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), new Buffer(device->allocator(),
const_cast<unsigned char*>(model_data), const_cast<unsigned char*>(model_data),
model_data_size)); model_data_size));
} else { } else {
tensor_buffer_ = std::unique_ptr<Buffer>( tensor_buffer_ = std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type))); new Buffer(device->allocator()));
MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size)); MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size));
tensor_buffer_->Map(nullptr); tensor_buffer_->Map(nullptr);
tensor_buffer_->Copy(const_cast<unsigned char*>(model_data), tensor_buffer_->Copy(const_cast<unsigned char*>(model_data),
...@@ -170,12 +173,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, ...@@ -170,12 +173,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
} }
} }
if (type == DeviceType::CPU || type == DeviceType::GPU) { if (device_type == DeviceType::CPU || device_type == DeviceType::GPU) {
MaceStatus status = CreateOutputTensorBuffer(net_def, type); MaceStatus status = CreateOutputTensorBuffer(net_def, device);
if (status != MaceStatus::MACE_SUCCESS) return status; if (status != MaceStatus::MACE_SUCCESS) return status;
} }
if (type == DeviceType::CPU && net_def.has_quantize_info()) { if (device_type == DeviceType::CPU && net_def.has_quantize_info()) {
for (const auto for (const auto
&activation_info: net_def.quantize_info().activation_info()) { &activation_info: net_def.quantize_info().activation_info()) {
if (HasTensor(activation_info.tensor_name())) { if (HasTensor(activation_info.tensor_name())) {
...@@ -193,7 +196,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, ...@@ -193,7 +196,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
} }
MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
DeviceType device_type) { Device *device) {
DeviceType device_type = device->device_type();
DataType dtype = DataType::DT_INVALID; DataType dtype = DataType::DT_INVALID;
if (net_def.mem_arena().mem_block_size() > 0) { if (net_def.mem_arena().mem_block_size() > 0) {
// We use the data type of the first op with mem id, // We use the data type of the first op with mem id,
...@@ -227,7 +231,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, ...@@ -227,7 +231,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
<< ", memory type: " << mem_block.mem_type(); << ", memory type: " << mem_block.mem_type();
if (mem_block.mem_type() == MemoryType::CPU_BUFFER) { if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf( std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(DeviceType::CPU))); new Buffer(GetCPUAllocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate( MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype) mem_block.x() * GetEnumTypeSize(dtype)
+ MACE_EXTRA_BUFFER_PAD_SIZE)); + MACE_EXTRA_BUFFER_PAD_SIZE));
...@@ -235,14 +239,14 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, ...@@ -235,14 +239,14 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
std::move(tensor_buf)); std::move(tensor_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) { } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
std::unique_ptr<BufferBase> image_buf( std::unique_ptr<BufferBase> image_buf(
new Image()); new Image(device->allocator()));
MACE_RETURN_IF_ERROR(image_buf->Allocate( MACE_RETURN_IF_ERROR(image_buf->Allocate(
{mem_block.x(), mem_block.y()}, dtype)); {mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(), preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf)); std::move(image_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) { } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf( std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(DeviceType::GPU))); new Buffer(device->allocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate( MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype))); mem_block.x() * GetEnumTypeSize(dtype)));
preallocated_allocator_.SetBuffer(mem_block.mem_id(), preallocated_allocator_.SetBuffer(mem_block.mem_id(),
...@@ -305,7 +309,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, ...@@ -305,7 +309,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
op, "T", static_cast<int>(DT_FLOAT))); op, "T", static_cast<int>(DT_FLOAT)));
} }
CreateTensor(op.output(i), CreateTensor(op.output(i),
GetDeviceAllocator(device_type), device->allocator(),
output_type); output_type);
} }
} }
...@@ -335,7 +339,8 @@ void Workspace::RemoveUnusedBuffer() { ...@@ -335,7 +339,8 @@ void Workspace::RemoveUnusedBuffer() {
} }
void Workspace::RemoveAndReloadBuffer(const NetDef &net_def, void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
const unsigned char *model_data) { const unsigned char *model_data,
Allocator *alloc) {
for (auto &const_tensor : net_def.tensors()) { for (auto &const_tensor : net_def.tensors()) {
auto iter = tensor_map_.find(const_tensor.name()); auto iter = tensor_map_.find(const_tensor.name());
if (iter->second->unused()) { if (iter->second->unused()) {
...@@ -347,8 +352,7 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def, ...@@ -347,8 +352,7 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
dims.push_back(d); dims.push_back(d);
} }
std::unique_ptr<Tensor> tensor( std::unique_ptr<Tensor> tensor(
new Tensor(GetDeviceAllocator(DeviceType::GPU), new Tensor(alloc, const_tensor.data_type()));
const_tensor.data_type()));
tensor->Resize(dims); tensor->Resize(dims);
MACE_CHECK(tensor->size() == const_tensor.data_size(), MACE_CHECK(tensor->size() == const_tensor.data_size(),
"Tensor's data_size not equal with the shape"); "Tensor's data_size not equal with the shape");
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <vector> #include <vector>
#include <memory> #include <memory>
#include "mace/core/device.h"
#include "mace/core/preallocated_pooled_allocator.h" #include "mace/core/preallocated_pooled_allocator.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
...@@ -48,7 +49,7 @@ class Workspace { ...@@ -48,7 +49,7 @@ class Workspace {
std::vector<std::string> Tensors() const; std::vector<std::string> Tensors() const;
MaceStatus LoadModelTensor(const NetDef &net_def, MaceStatus LoadModelTensor(const NetDef &net_def,
DeviceType type, Device *device,
const unsigned char *model_data); const unsigned char *model_data);
ScratchBuffer *GetScratchBuffer(DeviceType device_type); ScratchBuffer *GetScratchBuffer(DeviceType device_type);
...@@ -56,11 +57,14 @@ class Workspace { ...@@ -56,11 +57,14 @@ class Workspace {
void RemoveUnusedBuffer(); void RemoveUnusedBuffer();
void RemoveAndReloadBuffer(const NetDef &net_def, void RemoveAndReloadBuffer(const NetDef &net_def,
const unsigned char *model_data); const unsigned char *model_data,
Allocator *alloc);
private: private:
MaceStatus CreateOutputTensorBuffer(const NetDef &net_def, MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
DeviceType device_type); Device *device);
Device *device_;
TensorMap tensor_map_; TensorMap tensor_map_;
......
...@@ -37,15 +37,13 @@ public class AppModel { ...@@ -37,15 +37,13 @@ public class AppModel {
mJniThread = new Handler(thread.getLooper()); mJniThread = new Handler(thread.getLooper());
} }
public void maceMobilenetSetAttrs(final InitData initData) { public void maceMobilenetCreateGPUContext(final InitData initData) {
mJniThread.post(new Runnable() { mJniThread.post(new Runnable() {
@Override @Override
public void run() { public void run() {
int result = JniMaceUtils.maceMobilenetSetAttrs( int result = JniMaceUtils.maceMobilenetCreateGPUContext(
initData.getOmpNumThreads(), initData.getCpuAffinityPolicy(), initData.getStoragePath());
initData.getGpuPerfHint(), initData.getGpuPriorityHint(), Log.i("APPModel", "maceMobilenetCreateGPUContext result = " + result);
initData.getKernelPath());
Log.i("APPModel", "maceMobilenetSetAttrs result = " + result);
} }
}); });
} }
...@@ -54,7 +52,10 @@ public class AppModel { ...@@ -54,7 +52,10 @@ public class AppModel {
mJniThread.post(new Runnable() { mJniThread.post(new Runnable() {
@Override @Override
public void run() { public void run() {
int result = JniMaceUtils.maceMobilenetCreateEngine(initData.getModel(), initData.getDevice()); int result = JniMaceUtils.maceMobilenetCreateEngine(
initData.getOmpNumThreads(), initData.getCpuAffinityPolicy(),
initData.getGpuPerfHint(), initData.getGpuPriorityHint(),
initData.getModel(), initData.getDevice());
Log.i("APPModel", "maceMobilenetCreateEngine result = " + result); Log.i("APPModel", "maceMobilenetCreateEngine result = " + result);
if (result == -1) { if (result == -1) {
......
...@@ -139,7 +139,7 @@ public class CameraActivity extends Activity implements View.OnClickListener, Ap ...@@ -139,7 +139,7 @@ public class CameraActivity extends Activity implements View.OnClickListener, Ap
} }
private void initJni() { private void initJni() {
AppModel.instance.maceMobilenetSetAttrs(initData); AppModel.instance.maceMobilenetCreateGPUContext(initData);
AppModel.instance.maceMobilenetCreateEngine(initData, this); AppModel.instance.maceMobilenetCreateEngine(initData, this);
} }
......
...@@ -29,7 +29,7 @@ public class InitData { ...@@ -29,7 +29,7 @@ public class InitData {
private int cpuAffinityPolicy; private int cpuAffinityPolicy;
private int gpuPerfHint; private int gpuPerfHint;
private int gpuPriorityHint; private int gpuPriorityHint;
private String kernelPath = ""; private String storagePath = "";
public InitData() { public InitData() {
model = MODELS[0]; model = MODELS[0];
...@@ -38,8 +38,8 @@ public class InitData { ...@@ -38,8 +38,8 @@ public class InitData {
gpuPerfHint = 3; gpuPerfHint = 3;
gpuPriorityHint = 3; gpuPriorityHint = 3;
device = DEVICES[0]; device = DEVICES[0];
kernelPath = Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "mace"; storagePath = Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "mace";
File file = new File(kernelPath); File file = new File(storagePath);
if (!file.exists()) { if (!file.exists()) {
file.mkdir(); file.mkdir();
} }
...@@ -94,11 +94,11 @@ public class InitData { ...@@ -94,11 +94,11 @@ public class InitData {
this.gpuPriorityHint = gpuPriorityHint; this.gpuPriorityHint = gpuPriorityHint;
} }
public String getKernelPath() { public String getStoragePath() {
return kernelPath; return storagePath;
} }
public void setKernelPath(String kernelPath) { public void setStoragePath(String storagePath) {
this.kernelPath = kernelPath; this.storagePath = storagePath;
} }
} }
...@@ -26,7 +26,6 @@ ...@@ -26,7 +26,6 @@
#include <numeric> #include <numeric>
#include "src/main/cpp/include/mace/public/mace.h" #include "src/main/cpp/include/mace/public/mace.h"
#include "src/main/cpp/include/mace/public/mace_runtime.h"
#include "src/main/cpp/include/mace/public/mace_engine_factory.h" #include "src/main/cpp/include/mace/public/mace_engine_factory.h"
namespace { namespace {
...@@ -39,8 +38,8 @@ struct ModelInfo { ...@@ -39,8 +38,8 @@ struct ModelInfo {
}; };
struct MaceContext { struct MaceContext {
std::shared_ptr<mace::GPUContext> gpu_context;
std::shared_ptr<mace::MaceEngine> engine; std::shared_ptr<mace::MaceEngine> engine;
std::shared_ptr<mace::KVStorageFactory> storage_factory;
std::string model_name; std::string model_name;
mace::DeviceType device_type = mace::DeviceType::CPU; mace::DeviceType device_type = mace::DeviceType::CPU;
std::map<std::string, ModelInfo> model_infos = { std::map<std::string, ModelInfo> model_infos = {
...@@ -72,48 +71,65 @@ MaceContext& GetMaceContext() { ...@@ -72,48 +71,65 @@ MaceContext& GetMaceContext() {
} // namespace } // namespace
JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs( JNIEXPORT jint JNICALL
JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy, Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext(
jint gpu_perf_hint, jint gpu_priority_hint, jstring kernel_path) { JNIEnv *env, jclass thisObj, jstring storage_path) {
MaceContext &mace_context = GetMaceContext(); MaceContext &mace_context = GetMaceContext();
// DO NOT USE tmp directory.
// Please use APP's own directory and make sure the directory exists.
const char *storage_path_ptr = env->GetStringUTFChars(storage_path, nullptr);
if (storage_path_ptr == nullptr) return JNI_ERR;
const std::string storage_file_path(storage_path_ptr);
env->ReleaseStringUTFChars(storage_path, storage_path_ptr);
mace::MaceStatus status; mace_context.gpu_context = mace::GPUContextBuilder()
// openmp .SetStoragePath(storage_file_path)
status = mace::SetOpenMPThreadPolicy( .Finalize();
omp_num_threads,
static_cast<mace::CPUAffinityPolicy>(cpu_affinity_policy));
__android_log_print(ANDROID_LOG_ERROR,
"image_classify attrs",
"openmp result: %d, threads: %d, cpu: %d",
status, omp_num_threads, cpu_affinity_policy);
// gpu
mace::SetGPUHints(
static_cast<mace::GPUPerfHint>(gpu_perf_hint),
static_cast<mace::GPUPriorityHint>(gpu_priority_hint));
__android_log_print(ANDROID_LOG_ERROR,
"image_classify attrs",
"gpu perf: %d, priority: %d",
gpu_perf_hint, gpu_priority_hint);
// opencl cache
const char *kernel_path_ptr = env->GetStringUTFChars(kernel_path, nullptr);
if (kernel_path_ptr == nullptr) return JNI_ERR;
const std::string kernel_file_path(kernel_path_ptr);
mace_context.storage_factory.reset(
new mace::FileStorageFactory(kernel_file_path));
mace::SetKVStorageFactory(mace_context.storage_factory);
env->ReleaseStringUTFChars(kernel_path, kernel_path_ptr);
return JNI_OK; return JNI_OK;
} }
JNIEXPORT jint JNICALL JNIEXPORT jint JNICALL
Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine(
JNIEnv *env, jclass thisObj, jstring model_name_str, jstring device) { JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy,
jint gpu_perf_hint, jint gpu_priority_hint,
jstring model_name_str, jstring device) {
MaceContext &mace_context = GetMaceContext(); MaceContext &mace_context = GetMaceContext();
// get device
const char *device_ptr = env->GetStringUTFChars(device, nullptr);
if (device_ptr == nullptr) return JNI_ERR;
mace_context.device_type = ParseDeviceType(device_ptr);
env->ReleaseStringUTFChars(device, device_ptr);
// create MaceEngineConfig
mace::MaceStatus status;
mace::MaceEngineConfig config(mace_context.device_type);
status = config.SetCPUThreadPolicy(
omp_num_threads,
static_cast<mace::CPUAffinityPolicy>(cpu_affinity_policy));
if (status != mace::MACE_SUCCESS) {
__android_log_print(ANDROID_LOG_ERROR,
"image_classify attrs",
"openmp result: %d, threads: %d, cpu: %d",
status, omp_num_threads, cpu_affinity_policy);
}
if (mace_context.device_type == mace::DeviceType::GPU) {
config.SetGPUContext(mace_context.gpu_context);
config.SetGPUHints(
static_cast<mace::GPUPerfHint>(gpu_perf_hint),
static_cast<mace::GPUPriorityHint>(gpu_priority_hint));
__android_log_print(ANDROID_LOG_INFO,
"image_classify attrs",
"gpu perf: %d, priority: %d",
gpu_perf_hint, gpu_priority_hint);
}
__android_log_print(ANDROID_LOG_INFO,
"image_classify attrs",
"device: %d",
mace_context.device_type);
// parse model name // parse model name
const char *model_name_ptr = env->GetStringUTFChars(model_name_str, nullptr); const char *model_name_ptr = env->GetStringUTFChars(model_name_str, nullptr);
if (model_name_ptr == nullptr) return JNI_ERR; if (model_name_ptr == nullptr) return JNI_ERR;
...@@ -133,26 +149,15 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( ...@@ -133,26 +149,15 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine(
std::vector<std::string> input_names = {model_info_iter->second.input_name}; std::vector<std::string> input_names = {model_info_iter->second.input_name};
std::vector<std::string> output_names = {model_info_iter->second.output_name}; std::vector<std::string> output_names = {model_info_iter->second.output_name};
// get device
const char *device_ptr = env->GetStringUTFChars(device, nullptr);
if (device_ptr == nullptr) return JNI_ERR;
mace_context.device_type = ParseDeviceType(device_ptr);
env->ReleaseStringUTFChars(device, device_ptr);
__android_log_print(ANDROID_LOG_ERROR,
"image_classify attrs",
"device: %d",
mace_context.device_type);
mace::MaceStatus create_engine_status = mace::MaceStatus create_engine_status =
CreateMaceEngineFromCode(mace_context.model_name, CreateMaceEngineFromCode(mace_context.model_name,
std::string(), std::string(),
input_names, input_names,
output_names, output_names,
mace_context.device_type, config,
&mace_context.engine); &mace_context.engine);
__android_log_print(ANDROID_LOG_ERROR, __android_log_print(ANDROID_LOG_INFO,
"image_classify attrs", "image_classify attrs",
"create result: %d", "create result: %d",
create_engine_status); create_engine_status);
......
...@@ -24,11 +24,13 @@ extern "C" { ...@@ -24,11 +24,13 @@ extern "C" {
#endif #endif
/* /*
* Class: com_xiaomi_mace_JniMaceUtils * Class: com_xiaomi_mace_JniMaceUtils
* Method: maceMobilenetSetAttrs * Method: maceMobilenetCreateGPUContext
* Signature: (Ljava/lang/String;IIIILjava/lang/String;)I * Signature: (Ljava/lang/String;IIIILjava/lang/String;)I
*/ */
JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs JNIEXPORT jint JNICALL
(JNIEnv *, jclass, jint, jint, jint, jint, jstring); Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext(JNIEnv *,
jclass,
jstring);
/* /*
* Class: com_xiaomi_mace_JniMaceUtils * Class: com_xiaomi_mace_JniMaceUtils
...@@ -37,7 +39,7 @@ JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs ...@@ -37,7 +39,7 @@ JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs
*/ */
JNIEXPORT jint JNICALL JNIEXPORT jint JNICALL
Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine
(JNIEnv *, jclass, jstring, jstring); (JNIEnv *, jclass, jint, jint, jint, jint, jstring, jstring);
/* /*
* Class: com_xiaomi_mace_JniMaceUtils * Class: com_xiaomi_mace_JniMaceUtils
......
...@@ -20,9 +20,9 @@ public class JniMaceUtils { ...@@ -20,9 +20,9 @@ public class JniMaceUtils {
System.loadLibrary("mace_mobile_jni"); System.loadLibrary("mace_mobile_jni");
} }
public static native int maceMobilenetSetAttrs(int ompNumThreads, int cpuAffinityPolicy, int gpuPerfHint, int gpuPriorityHint, String kernelPath); public static native int maceMobilenetCreateGPUContext(String storagePath);
public static native int maceMobilenetCreateEngine(String model, String device); public static native int maceMobilenetCreateEngine(int ompNumThreads, int cpuAffinityPolicy, int gpuPerfHint, int gpuPriorityHint, String model, String device);
public static native float[] maceMobilenetClassify(float[] input); public static native float[] maceMobilenetClassify(float[] input);
......
...@@ -21,7 +21,6 @@ ...@@ -21,7 +21,6 @@
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
// if convert model to code. // if convert model to code.
#ifdef MODEL_GRAPH_FORMAT_CODE #ifdef MODEL_GRAPH_FORMAT_CODE
#include "mace/codegen/engine/mace_engine_factory.h" #include "mace/codegen/engine/mace_engine_factory.h"
...@@ -157,40 +156,40 @@ bool RunModel(const std::vector<std::string> &input_names, ...@@ -157,40 +156,40 @@ bool RunModel(const std::vector<std::string> &input_names,
const std::vector<std::vector<int64_t>> &output_shapes) { const std::vector<std::vector<int64_t>> &output_shapes) {
// load model // load model
DeviceType device_type = ParseDeviceType(FLAGS_device); DeviceType device_type = ParseDeviceType(FLAGS_device);
// config runtime // configuration
mace::SetOpenMPThreadPolicy( // Detailed information please see mace.h
MaceStatus status;
MaceEngineConfig config(device_type);
status = config.SetCPUThreadPolicy(
FLAGS_omp_num_threads, FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy)); static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
if (status != MACE_SUCCESS) {
std::cerr << "Set openmp or cpu affinity failed." << std::endl;
}
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
std::shared_ptr<GPUContext> gpu_context;
if (device_type == DeviceType::GPU) { if (device_type == DeviceType::GPU) {
mace::SetGPUHints( // DO NOT USE tmp directory.
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint), // Please use APP's own directory and make sure the directory exists.
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint)); const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
const std::string storage_path =
// Just call once. (Not thread-safe) std::string(storage_path_ptr == nullptr ?
// Set paths of Generated OpenCL Compiled Kernel Binary file "/data/local/tmp/mace_run/interior" : storage_path_ptr);
// if you build gpu library of specific soc.
// Using OpenCL binary will speed up the initialization.
// OpenCL binary is corresponding to the OpenCL Driver version,
// you should update the binary when OpenCL Driver changed.
std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file}; std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
mace::SetOpenCLBinaryPaths(opencl_binary_paths);
mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file); gpu_context = GPUContextBuilder()
.SetStoragePath(storage_path)
.SetOpenCLBinaryPaths(opencl_binary_paths)
.SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
.Finalize();
config.SetGPUContext(gpu_context);
config.SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
} }
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
// DO NOT USE tmp directory.
// Please use APP's own directory and make sure the directory exists.
// Just call once
const std::string internal_storage_path =
"/data/local/tmp/mace_run/interior";
// Config internal kv storage factory.
std::shared_ptr<KVStorageFactory> storage_factory(
new FileStorageFactory(internal_storage_path));
SetKVStorageFactory(storage_factory);
// Create Engine // Create Engine
std::shared_ptr<mace::MaceEngine> engine; std::shared_ptr<mace::MaceEngine> engine;
MaceStatus create_engine_status; MaceStatus create_engine_status;
...@@ -204,7 +203,7 @@ bool RunModel(const std::vector<std::string> &input_names, ...@@ -204,7 +203,7 @@ bool RunModel(const std::vector<std::string> &input_names,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#else #else
std::vector<unsigned char> model_pb_data; std::vector<unsigned char> model_pb_data;
...@@ -216,7 +215,7 @@ bool RunModel(const std::vector<std::string> &input_names, ...@@ -216,7 +215,7 @@ bool RunModel(const std::vector<std::string> &input_names,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#endif #endif
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -126,10 +127,14 @@ template <DeviceType D, typename T> ...@@ -126,10 +127,14 @@ template <DeviceType D, typename T>
class ActivationFunctor; class ActivationFunctor;
template <> template <>
class ActivationFunctor<DeviceType::CPU, float> { class ActivationFunctor<DeviceType::CPU, float> : OpKernel {
public: public:
ActivationFunctor(ActivationType type, float relux_max_limit) ActivationFunctor(OpKernelContext *context,
: activation_(type), relux_max_limit_(relux_max_limit) {} ActivationType type,
float relux_max_limit)
: OpKernel(context),
activation_(type),
relux_max_limit_(relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *alpha, const Tensor *alpha,
...@@ -159,10 +164,14 @@ class ActivationFunctor<DeviceType::CPU, float> { ...@@ -159,10 +164,14 @@ class ActivationFunctor<DeviceType::CPU, float> {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
class ActivationFunctor<DeviceType::GPU, T> { class ActivationFunctor<DeviceType::GPU, T> : OpKernel {
public: public:
ActivationFunctor(ActivationType type, T relux_max_limit) ActivationFunctor(OpKernelContext *context,
: activation_(type), relux_max_limit_(static_cast<T>(relux_max_limit)) {} ActivationType type,
T relux_max_limit)
: OpKernel(context),
activation_(type),
relux_max_limit_(static_cast<T>(relux_max_limit)) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *alpha, const Tensor *alpha,
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -35,10 +36,11 @@ namespace kernels { ...@@ -35,10 +36,11 @@ namespace kernels {
constexpr int kCostPerGroup = 1024; constexpr int kCostPerGroup = 1024;
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct AddNFunctor { struct AddNFunctor : OpKernel {
explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_tensors, MaceStatus operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future) { StatsFuture *future) {
MACE_UNUSED(future); MACE_UNUSED(future);
MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensors[0])); MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensors[0]));
index_t size = output_tensor->size(); index_t size = output_tensor->size();
...@@ -95,7 +97,8 @@ struct AddNFunctor { ...@@ -95,7 +97,8 @@ struct AddNFunctor {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct AddNFunctor<DeviceType::GPU, T> { struct AddNFunctor<DeviceType::GPU, T> : OpKernel {
explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_tensors, MaceStatus operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future); StatsFuture *future);
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
...@@ -30,7 +31,8 @@ namespace mace { ...@@ -30,7 +31,8 @@ namespace mace {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct ArgMaxFunctor { struct ArgMaxFunctor : OpKernel {
explicit ArgMaxFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *axis, const Tensor *axis,
Tensor *output, Tensor *output,
......
...@@ -37,10 +37,10 @@ TEST(ConvWinogradTest, winograd) { ...@@ -37,10 +37,10 @@ TEST(ConvWinogradTest, winograd) {
index_t filter_size = 3 * 3 * in_channels * out_channels; index_t filter_size = 3 * 3 * in_channels * out_channels;
index_t output_size = batch * out_channels * out_height * out_width; index_t output_size = batch * out_channels * out_height * out_width;
Tensor input; Tensor input(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor filter; Tensor filter(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor output; Tensor output(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor output_ref; Tensor output_ref(GetCPUAllocator(), DataType::DT_FLOAT);
input.Resize({batch, in_channels, in_height, in_width}); input.Resize({batch, in_channels, in_height, in_width});
filter.Resize({out_channels, in_channels, 3, 3}); filter.Resize({out_channels, in_channels, 3, 3});
......
...@@ -33,11 +33,13 @@ ...@@ -33,11 +33,13 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct BatchNormFunctorBase { struct BatchNormFunctorBase : OpKernel {
BatchNormFunctorBase(bool folded_constant, BatchNormFunctorBase(OpKernelContext *context,
bool folded_constant,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: folded_constant_(folded_constant), : OpKernel(context),
folded_constant_(folded_constant),
activation_(activation), activation_(activation),
relux_max_limit_(relux_max_limit) {} relux_max_limit_(relux_max_limit) {}
...@@ -51,10 +53,14 @@ struct BatchNormFunctor; ...@@ -51,10 +53,14 @@ struct BatchNormFunctor;
template<> template<>
struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase { struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant, BatchNormFunctor(OpKernelContext *context,
const bool folded_constant,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {} : BatchNormFunctorBase(context,
folded_constant,
activation,
relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *scale, const Tensor *scale,
...@@ -132,10 +138,14 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase { ...@@ -132,10 +138,14 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase { struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant, BatchNormFunctor(OpKernelContext *context,
const bool folded_constant,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {} : BatchNormFunctorBase(context,
folded_constant,
activation,
relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *scale, const Tensor *scale,
const Tensor *offset, const Tensor *offset,
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -30,10 +31,10 @@ ...@@ -30,10 +31,10 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct BiasAddFunctorBase { struct BiasAddFunctorBase : OpKernel {
explicit BiasAddFunctorBase(const DataFormat data_format) { BiasAddFunctorBase(OpKernelContext *context,
data_format_ = data_format; const DataFormat data_format)
} : OpKernel(context), data_format_(data_format) {}
DataFormat data_format_; DataFormat data_format_;
}; };
...@@ -43,8 +44,9 @@ struct BiasAddFunctor; ...@@ -43,8 +44,9 @@ struct BiasAddFunctor;
template <> template <>
struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase { struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase {
explicit BiasAddFunctor(const DataFormat data_format) BiasAddFunctor(OpKernelContext *context,
: BiasAddFunctorBase(data_format) {} const DataFormat data_format)
: BiasAddFunctorBase(context, data_format) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
...@@ -96,8 +98,8 @@ struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase { ...@@ -96,8 +98,8 @@ struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct BiasAddFunctor<DeviceType::GPU, T> : BiasAddFunctorBase { struct BiasAddFunctor<DeviceType::GPU, T> : BiasAddFunctorBase {
explicit BiasAddFunctor(const DataFormat data_format) BiasAddFunctor(OpKernelContext *context, const DataFormat data_format)
: BiasAddFunctorBase(data_format) {} : BiasAddFunctorBase(context, data_format) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
......
...@@ -20,21 +20,24 @@ ...@@ -20,21 +20,24 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/kernels/opencl/common.h" #include "mace/kernels/opencl/common.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct BufferToImageFunctorBase { struct BufferToImageFunctorBase : OpKernel {
explicit BufferToImageFunctorBase(const int wino_blk_size) explicit BufferToImageFunctorBase(OpKernelContext *context,
: wino_blk_size_(wino_blk_size) {} const int wino_blk_size)
: OpKernel(context), wino_blk_size_(wino_blk_size) {}
const int wino_blk_size_; const int wino_blk_size_;
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase { struct BufferToImageFunctor : BufferToImageFunctorBase {
explicit BufferToImageFunctor(const int wino_blk_size) explicit BufferToImageFunctor(OpKernelContext *context,
: BufferToImageFunctorBase(wino_blk_size) {} const int wino_blk_size)
: BufferToImageFunctorBase(context, wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
...@@ -50,8 +53,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase { ...@@ -50,8 +53,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
template <typename T> template <typename T>
struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase { struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
explicit BufferToImageFunctor(const int wino_blk_size) explicit BufferToImageFunctor(OpKernelContext *context,
: BufferToImageFunctorBase(wino_blk_size) {} const int wino_blk_size)
: BufferToImageFunctorBase(context, wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
......
...@@ -20,13 +20,15 @@ ...@@ -20,13 +20,15 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct ChannelShuffleFunctor { struct ChannelShuffleFunctor : OpKernel {
explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {} ChannelShuffleFunctor(OpKernelContext *context, const int groups)
: OpKernel(context), groups_(groups) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -70,8 +72,9 @@ struct ChannelShuffleFunctor { ...@@ -70,8 +72,9 @@ struct ChannelShuffleFunctor {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct ChannelShuffleFunctor<DeviceType::GPU, T> { struct ChannelShuffleFunctor<DeviceType::GPU, T> : OpKernel {
explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {} ChannelShuffleFunctor(OpKernelContext *context, const int groups)
: OpKernel(context), groups_(groups) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -30,15 +31,17 @@ ...@@ -30,15 +31,17 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct ConcatFunctorBase { struct ConcatFunctorBase : OpKernel {
explicit ConcatFunctorBase(const int32_t axis) : axis_(axis) {} ConcatFunctorBase(OpKernelContext *context, const int32_t axis)
: OpKernel(context), axis_(axis) {}
int32_t axis_; int32_t axis_;
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct ConcatFunctor : ConcatFunctorBase { struct ConcatFunctor : ConcatFunctorBase {
explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {} ConcatFunctor(OpKernelContext *context, const int32_t axis)
: ConcatFunctorBase(context, axis) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_list, MaceStatus operator()(const std::vector<const Tensor *> &input_list,
Tensor *output, Tensor *output,
...@@ -97,7 +100,8 @@ struct ConcatFunctor : ConcatFunctorBase { ...@@ -97,7 +100,8 @@ struct ConcatFunctor : ConcatFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct ConcatFunctor<DeviceType::GPU, T> : ConcatFunctorBase { struct ConcatFunctor<DeviceType::GPU, T> : ConcatFunctorBase {
explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {} ConcatFunctor(OpKernelContext *context, const int32_t axis)
: ConcatFunctorBase(context, axis) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_list, MaceStatus operator()(const std::vector<const Tensor *> &input_list,
Tensor *output, Tensor *output,
......
...@@ -42,14 +42,16 @@ ...@@ -42,14 +42,16 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct Conv2dFunctorBase { struct Conv2dFunctorBase : OpKernel {
Conv2dFunctorBase(const int *strides, Conv2dFunctorBase(OpKernelContext *context,
const int *strides,
const Padding &padding_type, const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: strides_(strides), : OpKernel(context),
strides_(strides),
padding_type_(padding_type), padding_type_(padding_type),
paddings_(paddings), paddings_(paddings),
dilations_(dilations), dilations_(dilations),
...@@ -69,7 +71,8 @@ struct Conv2dFunctor; ...@@ -69,7 +71,8 @@ struct Conv2dFunctor;
template<> template<>
struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase { struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides, Conv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding &padding_type, const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
...@@ -77,12 +80,14 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase { ...@@ -77,12 +80,14 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
const float relux_max_limit, const float relux_max_limit,
const bool is_filter_transformed, const bool is_filter_transformed,
ScratchBuffer *scratch) ScratchBuffer *scratch)
: Conv2dFunctorBase(strides, : Conv2dFunctorBase(context,
strides,
padding_type, padding_type,
paddings, paddings,
dilations, dilations,
activation, activation,
relux_max_limit), relux_max_limit),
transformed_filter_(GetCPUAllocator(), DataType::DT_FLOAT),
is_filter_transformed_(is_filter_transformed), is_filter_transformed_(is_filter_transformed),
scratch_(scratch) {} scratch_(scratch) {}
...@@ -721,7 +726,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase { ...@@ -721,7 +726,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
template<> template<>
struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase { struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides, Conv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding &padding_type, const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
...@@ -729,7 +735,8 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase { ...@@ -729,7 +735,8 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
const float relux_max_limit, const float relux_max_limit,
const bool is_filter_transformed, const bool is_filter_transformed,
ScratchBuffer *scratch) ScratchBuffer *scratch)
: Conv2dFunctorBase(strides, : Conv2dFunctorBase(context,
strides,
padding_type, padding_type,
paddings, paddings,
dilations, dilations,
...@@ -949,7 +956,8 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase { ...@@ -949,7 +956,8 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase { struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides, Conv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding &padding_type, const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
...@@ -957,7 +965,8 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase { ...@@ -957,7 +965,8 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
const float relux_max_limit, const float relux_max_limit,
const bool is_filter_transformed, const bool is_filter_transformed,
ScratchBuffer *scratch) ScratchBuffer *scratch)
: Conv2dFunctorBase(strides, : Conv2dFunctorBase(context,
strides,
padding_type, padding_type,
paddings, paddings,
dilations, dilations,
...@@ -968,10 +977,10 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase { ...@@ -968,10 +977,10 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
} }
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -30,10 +31,12 @@ ...@@ -30,10 +31,12 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct CropFunctorBase { struct CropFunctorBase : OpKernel {
CropFunctorBase(const int axis, CropFunctorBase(OpKernelContext *context,
const int axis,
const std::vector<int> &offset) const std::vector<int> &offset)
: axis_(axis), : OpKernel(context),
axis_(axis),
offset_(offset) {} offset_(offset) {}
const int axis_; const int axis_;
...@@ -42,8 +45,10 @@ struct CropFunctorBase { ...@@ -42,8 +45,10 @@ struct CropFunctorBase {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct CropFunctor : CropFunctorBase { struct CropFunctor : CropFunctorBase {
CropFunctor(const int axis, const std::vector<int> &offset) CropFunctor(OpKernelContext *context,
: CropFunctorBase(axis, offset) {} const int axis,
const std::vector<int> &offset)
: CropFunctorBase(context, axis, offset) {}
void crop_copy(const T* input_data, T* output_data, void crop_copy(const T* input_data, T* output_data,
const std::vector<index_t> &input_shape, const std::vector<index_t> &input_shape,
...@@ -121,12 +126,14 @@ struct CropFunctor : CropFunctorBase { ...@@ -121,12 +126,14 @@ struct CropFunctor : CropFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct CropFunctor<DeviceType::GPU, T> : CropFunctorBase { struct CropFunctor<DeviceType::GPU, T> : CropFunctorBase {
CropFunctor(const int axis, const std::vector<int> &offset) CropFunctor(OpKernelContext *context,
: CropFunctorBase(axis, offset) {} const int axis,
const std::vector<int> &offset)
: CropFunctorBase(context, axis, offset) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_list, MaceStatus operator()(const std::vector<const Tensor *> &input_list,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_; std::unique_ptr<BufferBase> kernel_error_;
......
...@@ -89,14 +89,16 @@ void Deconv2dNCHW(const T *input, ...@@ -89,14 +89,16 @@ void Deconv2dNCHW(const T *input,
} }
} // namespace deconv } // namespace deconv
struct Deconv2dFunctorBase { struct Deconv2dFunctorBase : OpKernel {
Deconv2dFunctorBase(const std::vector<int> &strides, Deconv2dFunctorBase(OpKernelContext *context,
const std::vector<int> &strides,
const Padding &padding_type, const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: strides_(strides), : OpKernel(context),
strides_(strides),
padding_type_(padding_type), padding_type_(padding_type),
paddings_(paddings), paddings_(paddings),
output_shape_(output_shape), output_shape_(output_shape),
...@@ -210,13 +212,15 @@ struct Deconv2dFunctorBase { ...@@ -210,13 +212,15 @@ struct Deconv2dFunctorBase {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct Deconv2dFunctor : Deconv2dFunctorBase { struct Deconv2dFunctor : Deconv2dFunctorBase {
Deconv2dFunctor(const std::vector<int> &strides, Deconv2dFunctor(OpKernelContext *context,
const std::vector<int> &strides,
const Padding &padding_type, const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: Deconv2dFunctorBase(strides, : Deconv2dFunctorBase(context,
strides,
padding_type, padding_type,
paddings, paddings,
output_shape, output_shape,
...@@ -315,13 +319,15 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { ...@@ -315,13 +319,15 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase { struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
Deconv2dFunctor(const std::vector<int> &strides, Deconv2dFunctor(OpKernelContext *context,
const std::vector<int> &strides,
const Padding &padding_type, const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: Deconv2dFunctorBase(strides, : Deconv2dFunctorBase(context,
strides,
padding_type, padding_type,
paddings, paddings,
output_shape, output_shape,
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -29,9 +30,11 @@ namespace mace { ...@@ -29,9 +30,11 @@ namespace mace {
namespace kernels { namespace kernels {
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct DepthToSpaceOpFunctor { struct DepthToSpaceOpFunctor : OpKernel {
explicit DepthToSpaceOpFunctor(const int block_size, bool d2s) DepthToSpaceOpFunctor(OpKernelContext *context,
: block_size_(block_size), d2s_(d2s) {} const int block_size,
bool d2s)
: OpKernel(context), block_size_(block_size), d2s_(d2s) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -123,9 +126,11 @@ struct DepthToSpaceOpFunctor { ...@@ -123,9 +126,11 @@ struct DepthToSpaceOpFunctor {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct DepthToSpaceOpFunctor<DeviceType::GPU, T> { struct DepthToSpaceOpFunctor<DeviceType::GPU, T> : OpKernel {
DepthToSpaceOpFunctor(const int block_size, bool d2s) DepthToSpaceOpFunctor(OpKernelContext *context,
: block_size_(block_size), d2s_(d2s) {} const int block_size,
bool d2s)
: OpKernel(context), block_size_(block_size), d2s_(d2s) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
......
...@@ -37,14 +37,16 @@ ...@@ -37,14 +37,16 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct DepthwiseConv2dFunctorBase { struct DepthwiseConv2dFunctorBase : OpKernel {
DepthwiseConv2dFunctorBase(const int *strides, DepthwiseConv2dFunctorBase(OpKernelContext *context,
const int *strides,
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: strides_(strides), : OpKernel(context),
strides_(strides),
padding_type_(padding_type), padding_type_(padding_type),
paddings_(paddings), paddings_(paddings),
dilations_(dilations), dilations_(dilations),
...@@ -65,13 +67,15 @@ struct DepthwiseConv2dFunctor; ...@@ -65,13 +67,15 @@ struct DepthwiseConv2dFunctor;
template<> template<>
struct DepthwiseConv2dFunctor<DeviceType::CPU, float> struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
: public DepthwiseConv2dFunctorBase { : public DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides, DepthwiseConv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: DepthwiseConv2dFunctorBase(strides, : DepthwiseConv2dFunctorBase(context,
strides,
padding_type, padding_type,
paddings, paddings,
dilations, dilations,
...@@ -288,13 +292,15 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float> ...@@ -288,13 +292,15 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
template<> template<>
struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t> struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t>
: public DepthwiseConv2dFunctorBase { : public DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides, DepthwiseConv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: DepthwiseConv2dFunctorBase(strides, : DepthwiseConv2dFunctorBase(context,
strides,
padding_type, padding_type,
paddings, paddings,
dilations, dilations,
...@@ -451,7 +457,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t> ...@@ -451,7 +457,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t>
const int32_t *bias_data = nullptr; const int32_t *bias_data = nullptr;
if (bias == nullptr) { if (bias == nullptr) {
zero_bias.reset( zero_bias.reset(
new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32)); new Tensor(GetCPUAllocator(), DT_INT32));
zero_bias->Resize(bias_shape); zero_bias->Resize(bias_shape);
zero_bias->Clear(); zero_bias->Clear();
bias_data = zero_bias->data<int32_t>(); bias_data = zero_bias->data<int32_t>();
...@@ -495,13 +501,15 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t> ...@@ -495,13 +501,15 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t>
template<typename T> template<typename T>
struct DepthwiseConv2dFunctor<DeviceType::GPU, T> struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
: DepthwiseConv2dFunctorBase { : DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides, DepthwiseConv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: DepthwiseConv2dFunctorBase(strides, : DepthwiseConv2dFunctorBase(context,
strides,
padding_type, padding_type,
paddings, paddings,
dilations, dilations,
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -802,13 +803,15 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -802,13 +803,15 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
} }
} }
struct EltwiseFunctorBase { struct EltwiseFunctorBase : OpKernel {
EltwiseFunctorBase(const EltwiseType type, EltwiseFunctorBase(OpKernelContext *context,
const EltwiseType type,
const std::vector<float> &coeff, const std::vector<float> &coeff,
const float scalar_input, const float scalar_input,
const int32_t scalar_input_index, const int32_t scalar_input_index,
const DataFormat data_format) const DataFormat data_format)
: type_(type), : OpKernel(context),
type_(type),
coeff_(coeff), coeff_(coeff),
scalar_input_(scalar_input), scalar_input_(scalar_input),
scalar_input_index_(scalar_input_index), scalar_input_index_(scalar_input_index),
...@@ -823,12 +826,14 @@ struct EltwiseFunctorBase { ...@@ -823,12 +826,14 @@ struct EltwiseFunctorBase {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct EltwiseFunctor : EltwiseFunctorBase { struct EltwiseFunctor : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type, EltwiseFunctor(OpKernelContext *context,
const EltwiseType type,
const std::vector<float> &coeff, const std::vector<float> &coeff,
const float scalar_input, // float as it comes from arg const float scalar_input, // float as it comes from arg
const int32_t scalar_input_index, const int32_t scalar_input_index,
const DataFormat data_format) const DataFormat data_format)
: EltwiseFunctorBase(type, : EltwiseFunctorBase(context,
type,
coeff, coeff,
scalar_input, scalar_input,
scalar_input_index, scalar_input_index,
...@@ -956,12 +961,14 @@ struct EltwiseFunctor : EltwiseFunctorBase { ...@@ -956,12 +961,14 @@ struct EltwiseFunctor : EltwiseFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase { struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type, EltwiseFunctor(OpKernelContext *context,
const EltwiseType type,
const std::vector<float> &coeff, const std::vector<float> &coeff,
const float scalar_input, const float scalar_input,
const int32_t scalar_input_index, const int32_t scalar_input_index,
const DataFormat data_format) const DataFormat data_format)
: EltwiseFunctorBase(type, : EltwiseFunctorBase(context,
type,
coeff, coeff,
scalar_input, scalar_input,
scalar_input_index, scalar_input_index,
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
...@@ -30,8 +31,8 @@ template <DeviceType D, class T> ...@@ -30,8 +31,8 @@ template <DeviceType D, class T>
struct FillFunctor; struct FillFunctor;
template <> template <>
struct FillFunctor<DeviceType::CPU, float> { struct FillFunctor<DeviceType::CPU, float> : OpKernel {
FillFunctor() {} explicit FillFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *shape, MaceStatus operator()(const Tensor *shape,
const Tensor *value, const Tensor *value,
......
...@@ -27,10 +27,12 @@ ...@@ -27,10 +27,12 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct FullyConnectedBase { struct FullyConnectedBase : OpKernel {
FullyConnectedBase(const ActivationType activation, FullyConnectedBase(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: activation_(activation), : OpKernel(context),
activation_(activation),
relux_max_limit_(relux_max_limit) {} relux_max_limit_(relux_max_limit) {}
const ActivationType activation_; const ActivationType activation_;
...@@ -42,9 +44,10 @@ struct FullyConnectedFunctor; ...@@ -42,9 +44,10 @@ struct FullyConnectedFunctor;
template <> template <>
struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase { struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation, FullyConnectedFunctor(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: FullyConnectedBase(activation, relux_max_limit) {} : FullyConnectedBase(context, activation, relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *weight, const Tensor *weight,
...@@ -86,9 +89,10 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase { ...@@ -86,9 +89,10 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
template <> template <>
struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase { struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation, FullyConnectedFunctor(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: FullyConnectedBase(activation, relux_max_limit) {} : FullyConnectedBase(context, activation, relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *weight, const Tensor *weight,
...@@ -117,7 +121,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase { ...@@ -117,7 +121,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
const int32_t *bias_ptr = nullptr; const int32_t *bias_ptr = nullptr;
if (bias == nullptr) { if (bias == nullptr) {
zero_bias.reset( zero_bias.reset(
new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32)); new Tensor(GetCPUAllocator(), DT_INT32));
zero_bias->Resize(bias_shape); zero_bias->Resize(bias_shape);
zero_bias->Clear(); zero_bias->Clear();
bias_ptr = zero_bias->data<int32_t>(); bias_ptr = zero_bias->data<int32_t>();
...@@ -148,9 +152,10 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase { ...@@ -148,9 +152,10 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase { struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation, FullyConnectedFunctor(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: FullyConnectedBase(activation, relux_max_limit) {} : FullyConnectedBase(context, activation, relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *weight, const Tensor *weight,
......
...@@ -21,13 +21,15 @@ ...@@ -21,13 +21,15 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct GatherBase { struct GatherBase : OpKernel {
explicit GatherBase(int axis, float y) : axis_(axis), y_(y) {} GatherBase(OpKernelContext *context, int axis, float y)
: OpKernel(context), axis_(axis), y_(y) {}
int axis_; int axis_;
float y_; float y_;
...@@ -38,7 +40,8 @@ struct GatherFunctor; ...@@ -38,7 +40,8 @@ struct GatherFunctor;
template <> template <>
struct GatherFunctor<DeviceType::CPU, float> : GatherBase { struct GatherFunctor<DeviceType::CPU, float> : GatherBase {
explicit GatherFunctor(int axis, float y) : GatherBase(axis, y) {} GatherFunctor(OpKernelContext *context, int axis, float y)
: GatherBase(context, axis, y) {}
MaceStatus operator()(const Tensor *params, MaceStatus operator()(const Tensor *params,
const Tensor *indices, const Tensor *indices,
......
...@@ -1341,8 +1341,8 @@ void Gemm(const float *A, ...@@ -1341,8 +1341,8 @@ void Gemm(const float *A,
ik_begin = bk * block_size_k + (bk < remain_k ? bk : remain_k); ik_begin = bk * block_size_k + (bk < remain_k ? bk : remain_k);
const index_t ik_end = std::min(K, ik_begin + this_block_size_k); const index_t ik_end = std::min(K, ik_begin + this_block_size_k);
Tensor trans_a; Tensor trans_a(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor trans_b; Tensor trans_b(GetCPUAllocator(), DataType::DT_FLOAT);
const float *real_a = nullptr; const float *real_a = nullptr;
const float *real_b = nullptr; const float *real_b = nullptr;
float *real_c = c_base + (ih_begin * width + iw_begin); float *real_c = c_base + (ih_begin * width + iw_begin);
...@@ -1399,8 +1399,8 @@ void GemmRef(const float *A, ...@@ -1399,8 +1399,8 @@ void GemmRef(const float *A,
const bool transpose_b) { const bool transpose_b) {
memset(C, 0, sizeof(float) * batch * height * width); memset(C, 0, sizeof(float) * batch * height * width);
Tensor trans_a; Tensor trans_a(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor trans_b; Tensor trans_b(GetCPUAllocator(), DataType::DT_FLOAT);
float *trans_a_data = nullptr; float *trans_a_data = nullptr;
float *trans_b_data = nullptr; float *trans_b_data = nullptr;
if (transpose_a) { if (transpose_a) {
......
...@@ -20,21 +20,24 @@ ...@@ -20,21 +20,24 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/kernels/opencl/common.h" #include "mace/kernels/opencl/common.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct ImageToBufferFunctorBase { struct ImageToBufferFunctorBase : OpKernel {
explicit ImageToBufferFunctorBase(const int wino_blk_size) ImageToBufferFunctorBase(OpKernelContext *context,
: wino_blk_size_(wino_blk_size) {} const int wino_blk_size)
: OpKernel(context),
wino_blk_size_(wino_blk_size) {}
const int wino_blk_size_; const int wino_blk_size_;
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct ImageToBufferFunctor : ImageToBufferFunctorBase { struct ImageToBufferFunctor : ImageToBufferFunctorBase {
explicit ImageToBufferFunctor(const int wino_blk_size) ImageToBufferFunctor(OpKernelContext *context, const int wino_blk_size)
: ImageToBufferFunctorBase(wino_blk_size) {} : ImageToBufferFunctorBase(context, wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
...@@ -50,8 +53,9 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase { ...@@ -50,8 +53,9 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
template <typename T> template <typename T>
struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase { struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase {
explicit ImageToBufferFunctor(const int wino_blk_size) ImageToBufferFunctor(OpKernelContext *context,
: ImageToBufferFunctorBase(wino_blk_size) {} const int wino_blk_size)
: ImageToBufferFunctorBase(context, wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_KERNEL_H_
#define MACE_KERNELS_KERNEL_H_
#include "mace/core/op_kernel_context.h"
namespace mace {
namespace kernels {
struct OpKernel {
explicit OpKernel(OpKernelContext *context): context_(context) {}
OpKernelContext *context_;
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_KERNEL_H_
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/public/mace.h" #include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -34,7 +34,9 @@ template<DeviceType D, typename T> ...@@ -34,7 +34,9 @@ template<DeviceType D, typename T>
struct LocalResponseNormFunctor; struct LocalResponseNormFunctor;
template<> template<>
struct LocalResponseNormFunctor<DeviceType::CPU, float> { struct LocalResponseNormFunctor<DeviceType::CPU, float> : OpKernel {
explicit LocalResponseNormFunctor(OpKernelContext *context)
: OpKernel(context) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
int depth_radius, int depth_radius,
float bias, float bias,
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#if defined(MACE_ENABLE_NEON) #if defined(MACE_ENABLE_NEON)
#include <arm_neon.h> #include <arm_neon.h>
...@@ -35,9 +36,10 @@ template <DeviceType D, typename T> ...@@ -35,9 +36,10 @@ template <DeviceType D, typename T>
struct LSTMCellFunctor; struct LSTMCellFunctor;
template <typename T> template <typename T>
struct LSTMCellFunctor<DeviceType::GPU, T> { struct LSTMCellFunctor<DeviceType::GPU, T> : OpKernel{
explicit LSTMCellFunctor(T forget_bias) : LSTMCellFunctor(OpKernelContext *context, T forget_bias)
forget_bias_(static_cast<T>(forget_bias)) {} : OpKernel(context),
forget_bias_(static_cast<T>(forget_bias)) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *pre_output, const Tensor *pre_output,
const Tensor *weight, const Tensor *weight,
......
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/gemm.h" #include "mace/kernels/gemm.h"
#include "mace/kernels/kernel.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
#include "mace/kernels/gemmlowp_util.h" #include "mace/kernels/gemmlowp_util.h"
...@@ -40,7 +41,8 @@ namespace mace { ...@@ -40,7 +41,8 @@ namespace mace {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct MatMulFunctor { struct MatMulFunctor : OpKernel {
explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *A, MaceStatus operator()(const Tensor *A,
const Tensor *B, const Tensor *B,
Tensor *C, Tensor *C,
...@@ -87,7 +89,7 @@ struct MatMulFunctor { ...@@ -87,7 +89,7 @@ struct MatMulFunctor {
// A * B = (B^T * A^T)^T // A * B = (B^T * A^T)^T
if (!transpose_b) { if (!transpose_b) {
if (B_transpose_.get() == nullptr) { if (B_transpose_.get() == nullptr) {
B_transpose_.reset(new Tensor(GetDeviceAllocator(D), B_transpose_.reset(new Tensor(context_->device()->allocator(),
DataTypeToEnum<T>::v())); DataTypeToEnum<T>::v()));
B_transpose_->Resize({batch, width, K}); B_transpose_->Resize({batch, width, K});
Tensor::MappingGuard guardbt(B_transpose_.get()); Tensor::MappingGuard guardbt(B_transpose_.get());
...@@ -112,7 +114,8 @@ struct MatMulFunctor { ...@@ -112,7 +114,8 @@ struct MatMulFunctor {
}; };
template <> template <>
struct MatMulFunctor<CPU, uint8_t> { struct MatMulFunctor<CPU, uint8_t> : OpKernel {
explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
template<gemmlowp::MapOrder AOrder, gemmlowp::MapOrder BOrder> template<gemmlowp::MapOrder AOrder, gemmlowp::MapOrder BOrder>
void MatMulImpl(const Tensor *A, void MatMulImpl(const Tensor *A,
const Tensor *B, const Tensor *B,
...@@ -208,7 +211,8 @@ struct MatMulFunctor<CPU, uint8_t> { ...@@ -208,7 +211,8 @@ struct MatMulFunctor<CPU, uint8_t> {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct MatMulFunctor<DeviceType::GPU, T> { struct MatMulFunctor<DeviceType::GPU, T> : OpKernel {
explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *A, MaceStatus operator()(const Tensor *A,
const Tensor *B, const Tensor *B,
Tensor *C, Tensor *C,
......
...@@ -33,11 +33,11 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()( ...@@ -33,11 +33,11 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
const index_t channel_blocks = RoundUpDiv4(channels); const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name); built_options.emplace("-Dactivation=" + kernel_name);
...@@ -94,12 +94,12 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()( ...@@ -94,12 +94,12 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3)); output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS; return MACE_SUCCESS;
......
...@@ -34,7 +34,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( ...@@ -34,7 +34,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
const index_t width = input_tensors[0]->dim(2); const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3); const index_t channels = input_tensors[0]->dim(3);
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
for (size_t i = 1; i < size; ++i) { for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]); MACE_CHECK_NOTNULL(input_tensors[i]);
...@@ -49,7 +49,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( ...@@ -49,7 +49,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
...@@ -96,7 +96,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( ...@@ -96,7 +96,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
std::string tuning_key = std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3)); output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS; return MACE_SUCCESS;
......
...@@ -44,11 +44,11 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()( ...@@ -44,11 +44,11 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
...@@ -101,11 +101,11 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()( ...@@ -101,11 +101,11 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0), Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_); output->dim(1), output->dim(2), output->dim(3), folded_constant_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS; return MACE_SUCCESS;
......
...@@ -39,12 +39,12 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -39,12 +39,12 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name); built_options.emplace("-Dbias_add=" + kernel_name);
...@@ -65,7 +65,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -65,7 +65,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event; cl::Event event;
cl_int error; cl_int error;
......
...@@ -75,12 +75,12 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -75,12 +75,12 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
} }
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
......
...@@ -41,11 +41,11 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -41,11 +41,11 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name); built_options.emplace("-Dchannel_shuffle=" + kernel_name);
...@@ -72,11 +72,11 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -72,11 +72,11 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1), Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS; return MACE_SUCCESS;
......
...@@ -22,13 +22,15 @@ namespace mace { ...@@ -22,13 +22,15 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]); lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
...@@ -41,7 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { ...@@ -41,7 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
} // namespace } // namespace
static MaceStatus Concat2(cl::Kernel *kernel, static MaceStatus Concat2(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input0, const Tensor *input0,
const Tensor *input1, const Tensor *input1,
const DataType dt, const DataType dt,
...@@ -61,11 +64,11 @@ static MaceStatus Concat2(cl::Kernel *kernel, ...@@ -61,11 +64,11 @@ static MaceStatus Concat2(cl::Kernel *kernel,
static_cast<uint32_t>(batch * height), static_cast<uint32_t>(batch * height),
}; };
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name); built_options.emplace("-Dconcat_channel=" + kernel_name);
...@@ -100,17 +103,18 @@ static MaceStatus Concat2(cl::Kernel *kernel, ...@@ -100,17 +103,18 @@ static MaceStatus Concat2(cl::Kernel *kernel,
*prev_input_shape = input0->shape(); *prev_input_shape = input0->shape();
} }
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("concat_opencl_kernel", output->dim(0), output->dim(1), Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error); OUT_OF_RANGE_VALIDATION(*kernel_error);
return MACE_SUCCESS; return MACE_SUCCESS;
} }
static MaceStatus ConcatN(cl::Kernel *kernel, static MaceStatus ConcatN(OpKernelContext *context,
cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list, const std::vector<const Tensor *> &input_list,
const DataType dt, const DataType dt,
Tensor *output, Tensor *output,
...@@ -121,11 +125,11 @@ static MaceStatus ConcatN(cl::Kernel *kernel, ...@@ -121,11 +125,11 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
...@@ -148,7 +152,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel, ...@@ -148,7 +152,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width), static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height), static_cast<uint32_t>(batch * height),
}; };
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
uint32_t idx = 0; uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG_PTR; OUT_OF_RANGE_SET_ARG_PTR;
...@@ -168,8 +172,6 @@ static MaceStatus ConcatN(cl::Kernel *kernel, ...@@ -168,8 +172,6 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
for (size_t j = 0; j < 3; ++j) { for (size_t j = 0; j < 3; ++j) {
roundup_gws[j] = RoundUp(gws[j], lws[j]); roundup_gws[j] = RoundUp(gws[j], lws[j]);
} }
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, *kernel, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
...@@ -187,7 +189,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel, ...@@ -187,7 +189,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
} }
} }
if (future != nullptr) { if (future != nullptr) {
future->wait_fn = [runtime, call_stats](CallStats *stats) { future->wait_fn = [call_stats](CallStats *stats) {
if (stats != nullptr) { if (stats != nullptr) {
stats->start_micros = call_stats.start_micros; stats->start_micros = call_stats.start_micros;
stats->end_micros = stats->start_micros + call_stats.end_micros; stats->end_micros = stats->start_micros + call_stats.end_micros;
...@@ -234,12 +236,14 @@ MaceStatus ConcatFunctor<DeviceType::GPU, T>::operator()( ...@@ -234,12 +236,14 @@ MaceStatus ConcatFunctor<DeviceType::GPU, T>::operator()(
switch (inputs_count) { switch (inputs_count) {
case 2: case 2:
return Concat2(&kernel_, input_list[0], input_list[1], return Concat2(context_,
&kernel_, input_list[0], input_list[1],
DataTypeToEnum<T>::value, &input_shape_, output, future, DataTypeToEnum<T>::value, &input_shape_, output, future,
&kwg_size_, &kernel_error_); &kwg_size_, &kernel_error_);
default: default:
if (divisible_four) { if (divisible_four) {
return ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, return ConcatN(context_,
&kernel_, input_list, DataTypeToEnum<T>::value, output,
future, &kwg_size_, &kernel_error_); future, &kwg_size_, &kernel_error_);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
......
...@@ -18,7 +18,8 @@ ...@@ -18,7 +18,8 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *runtime,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
...@@ -34,7 +35,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -34,7 +35,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error); std::unique_ptr<BufferBase> *kernel_error);
extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *runtime,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
...@@ -50,7 +52,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -50,7 +52,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error); std::unique_ptr<BufferBase> *kernel_error);
extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, extern MaceStatus Conv2dOpencl(OpKernelContext *runtime,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
...@@ -73,9 +76,10 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -73,9 +76,10 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
typedef MaceStatus (*Conv2dOpenclFunction)( typedef MaceStatus (*Conv2dOpenclFunction)(
cl::Kernel * kernel, const Tensor *input, const Tensor *filter, OpKernelContext *runtime, cl::Kernel * kernel, const Tensor *input,
const Tensor *bias, const int stride, const int *padding, const Tensor *filter, const Tensor *bias, const int stride,
const int *dilations, const ActivationType activation, const int *padding, const int *dilations,
const ActivationType activation,
const float relux_max_limit, const DataType dt, const float relux_max_limit, const DataType dt,
std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future, std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
uint32_t *kwg_size, std::unique_ptr<BufferBase> *kernel_error); uint32_t *kwg_size, std::unique_ptr<BufferBase> *kernel_error);
...@@ -116,12 +120,12 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -116,12 +120,12 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 3 && if (kernel_h == kernel_w && kernel_h <= 3 &&
selector[kernel_h - 1] != nullptr) { selector[kernel_h - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1]; auto conv2d_func = selector[kernel_h - 1];
return conv2d_func( return conv2d_func(context_,
&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_, activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_, &kernel_error_); output, future, &kwg_size_, &kernel_error_);
} else { } else {
return Conv2dOpencl( return Conv2dOpencl(context_,
&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_, activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_, &kernel_error_); output, future, &kwg_size_, &kernel_error_);
......
...@@ -25,14 +25,16 @@ namespace { ...@@ -25,14 +25,16 @@ namespace {
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4; const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
// TODO(liuqi): Fix the specific value. // TODO(liuqi): Fix the specific value.
const uint32_t lws_limit = 128; const uint32_t lws_limit = 128;
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); uint32_t compute_units = runtime->device_compute_units();
const uint32_t base = const uint32_t base =
std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
...@@ -62,7 +64,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { ...@@ -62,7 +64,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
} // namespace } // namespace
extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
...@@ -92,13 +95,13 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -92,13 +95,13 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
const index_t width_blocks = RoundUpDiv4(width); const index_t width_blocks = RoundUpDiv4(width);
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
MACE_CHECK(input_batch == batch); MACE_CHECK(input_batch == batch);
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
...@@ -160,11 +163,11 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -160,11 +163,11 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1), Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error); OUT_OF_RANGE_VALIDATION(*kernel_error);
return MACE_SUCCESS; return MACE_SUCCESS;
......
...@@ -24,15 +24,17 @@ namespace kernels { ...@@ -24,15 +24,17 @@ namespace kernels {
namespace { namespace {
// (inputs + weights + outputs) * array_size * sizeof(float) // (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4; const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t compute_units = std::max<uint32_t>( uint32_t compute_units = std::max<uint32_t>(
OpenCLRuntime::Global()->device_compute_units() / 2, 1); runtime->device_compute_units() / 2, 1);
const uint32_t base = const uint32_t base =
std::max<uint32_t>( std::max<uint32_t>(
std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1); std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1);
...@@ -55,7 +57,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { ...@@ -55,7 +57,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
} // namespace } // namespace
extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
...@@ -80,11 +83,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -80,11 +83,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv<index_t, 5>(width); const index_t width_blocks = RoundUpDiv<index_t, 5>(width);
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
...@@ -147,11 +150,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -147,11 +150,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1), Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error); OUT_OF_RANGE_VALIDATION(*kernel_error);
return MACE_SUCCESS; return MACE_SUCCESS;
......
...@@ -26,7 +26,8 @@ namespace { ...@@ -26,7 +26,8 @@ namespace {
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4; const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
// TODO(liuqi): Fix the specific value. // TODO(liuqi): Fix the specific value.
const uint32_t lws_limit = 20; const uint32_t lws_limit = 20;
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kernel_size, const uint32_t kernel_size,
const uint32_t kwg_size) { const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
...@@ -34,8 +35,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -34,8 +35,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); uint32_t compute_units = runtime->device_compute_units();
const uint32_t base = const uint32_t base =
std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
...@@ -64,7 +65,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -64,7 +65,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace } // namespace
extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, extern MaceStatus Conv2dOpencl(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
...@@ -89,11 +91,11 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, ...@@ -89,11 +91,11 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width); const index_t width_blocks = RoundUpDiv4(width);
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
built_options.emplace("-Dconv_2d=" + kernel_name); built_options.emplace("-Dconv_2d=" + kernel_name);
...@@ -162,8 +164,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, ...@@ -162,8 +164,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
Concat("conv2d_general_opencl_kernel", output->dim(0), output->dim(1), Concat("conv2d_general_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3), filter->dim(2), filter->dim(3)); output->dim(2), output->dim(3), filter->dim(2), filter->dim(3));
std::vector<uint32_t> lws = std::vector<uint32_t> lws =
LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size); LocalWS(runtime, gws, filter->dim(2) * filter->dim(3), *kwg_size);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error); OUT_OF_RANGE_VALIDATION(*kernel_error);
......
...@@ -22,13 +22,15 @@ namespace mace { ...@@ -22,13 +22,15 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]); lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
...@@ -132,11 +134,11 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()( ...@@ -132,11 +134,11 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(output->dim(0) * output->dim(1)) static_cast<uint32_t>(output->dim(0) * output->dim(1))
}; };
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
built_options.emplace("-Dcrop=" + kernel_name); built_options.emplace("-Dcrop=" + kernel_name);
...@@ -167,11 +169,11 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()( ...@@ -167,11 +169,11 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input0->shape(); input_shape_ = input0->shape();
} }
const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("crop_opencl_kernel", output->dim(0), output->dim(1), Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS; return MACE_SUCCESS;
......
...@@ -20,7 +20,8 @@ namespace kernels { ...@@ -20,7 +20,8 @@ namespace kernels {
namespace { namespace {
MaceStatus Deconv2dOpencl(cl::Kernel *kernel, MaceStatus Deconv2dOpencl(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
...@@ -58,11 +59,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, ...@@ -58,11 +59,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
const int align_w = stride_w - 1 - padding_w; const int align_w = stride_w - 1 - padding_w;
const int kernel_size = filter->dim(2) * filter->dim(3); const int kernel_size = filter->dim(2) * filter->dim(3);
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
built_options.emplace("-Ddeconv_2d=" + kernel_name); built_options.emplace("-Ddeconv_2d=" + kernel_name);
...@@ -133,11 +134,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, ...@@ -133,11 +134,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, *kwg_size); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("deconv2d_opencl_kernel_", activation, output->dim(0), Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error); OUT_OF_RANGE_VALIDATION(*kernel_error);
...@@ -192,9 +193,10 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()( ...@@ -192,9 +193,10 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
&output_image_shape); &output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return Deconv2dOpencl(&kernel_, input, filter, bias, strides_.data(), return Deconv2dOpencl(context_, &kernel_, input, filter, bias,
paddings.data(), activation_, relux_max_limit_, strides_.data(), paddings.data(), activation_,
DataTypeToEnum<T>::value, &input_shape_, output, future, relux_max_limit_, DataTypeToEnum<T>::value,
&input_shape_, output, future,
&kwg_size_, &kernel_error_); &kwg_size_, &kernel_error_);
} }
......
...@@ -72,11 +72,11 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -72,11 +72,11 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
...@@ -119,8 +119,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -119,8 +119,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -24,13 +24,15 @@ namespace kernels { ...@@ -24,13 +24,15 @@ namespace kernels {
namespace { namespace {
// (inputs + weights + outputs) * array_size * sizeof(float) // (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4; const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) { if (lws[1] >= base) {
...@@ -58,7 +60,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { ...@@ -58,7 +60,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
} // namespace } // namespace
static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, static MaceStatus DepthwiseConv2d(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, // NHWC const Tensor *input, // NHWC
const Tensor *filter, // HWIM const Tensor *filter, // HWIM
const Tensor *bias, const Tensor *bias,
...@@ -89,11 +92,11 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, ...@@ -89,11 +92,11 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
static_cast<uint32_t>(width_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) { if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
...@@ -170,10 +173,10 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, ...@@ -170,10 +173,10 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier); Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error); OUT_OF_RANGE_VALIDATION(*kernel_error);
...@@ -190,14 +193,10 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()( ...@@ -190,14 +193,10 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
index_t kernel_h = filter->dim(2); index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3); index_t kernel_w = filter->dim(3);
if (strides_[0] != strides_[1]) { if (strides_[0] != strides_[1]) {
LOG(WARNING) << "OpenCL depthwise conv2d kernel with " LOG(FATAL) << "GPU depthwise conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << "," << "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1] << " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version"; << " is not implemented yet.";
// TODO(heliangliang) The CPU/NEON kernel should map the buffer
return DepthwiseConv2dFunctor<DeviceType::CPU, float>(
strides_, padding_type_, paddings_, dilations_, activation_,
relux_max_limit_)(input, filter, bias, output, future);
} }
// Create a fake conv_2d filter to calculate the paddings and output size // Create a fake conv_2d filter to calculate the paddings and output size
...@@ -226,6 +225,7 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()( ...@@ -226,6 +225,7 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return DepthwiseConv2d( return DepthwiseConv2d(
context_,
&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_, activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_, &kernel_error_); output, future, &kwg_size_, &kernel_error_);
......
...@@ -75,10 +75,10 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -75,10 +75,10 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch_height_pixels)}; static_cast<uint32_t>(batch_height_pixels)};
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
...@@ -124,11 +124,11 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -124,11 +124,11 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
input_shape_ = input0->shape(); input_shape_ = input0->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1), Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS; return MACE_SUCCESS;
......
...@@ -22,7 +22,8 @@ namespace kernels { ...@@ -22,7 +22,8 @@ namespace kernels {
namespace { namespace {
template <typename T> template <typename T>
MaceStatus FCWXKernel(cl::Kernel *kernel, MaceStatus FCWXKernel(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *weight, const Tensor *weight,
const Tensor *bias, const Tensor *bias,
...@@ -36,7 +37,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, ...@@ -36,7 +37,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws); MACE_CHECK_NOTNULL(lws);
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
...@@ -44,7 +45,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, ...@@ -44,7 +45,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
const index_t output_blocks = RoundUpDiv4(output_size); const index_t output_blocks = RoundUpDiv4(output_size);
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
...@@ -154,7 +155,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, ...@@ -154,7 +155,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
} }
template <typename T> template <typename T>
MaceStatus FCWTXKernel(cl::Kernel *kernel, MaceStatus FCWTXKernel(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *weight, const Tensor *weight,
const Tensor *bias, const Tensor *bias,
...@@ -168,10 +170,10 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, ...@@ -168,10 +170,10 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws); MACE_CHECK_NOTNULL(lws);
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected");
...@@ -236,7 +238,7 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, ...@@ -236,7 +238,7 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
std::string tuning_key = std::string tuning_key =
Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2),
output->dim(3)); output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key,
gws->data(), *lws, future)); gws->data(), *lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error); OUT_OF_RANGE_VALIDATION(*kernel_error);
...@@ -257,7 +259,8 @@ MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()( ...@@ -257,7 +259,8 @@ MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
&output_image_shape); &output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output, return FCWXKernel<T>(context_,
&kernel_, input, weight, bias, &input_shape_, output,
activation_, &gws_, &lws_, relux_max_limit_, future, activation_, &gws_, &lws_, relux_max_limit_, future,
&kernel_error_); &kernel_error_);
} }
......
...@@ -226,14 +226,14 @@ std::string DtToUpCompatibleCLCMDDt(const DataType dt) { ...@@ -226,14 +226,14 @@ std::string DtToUpCompatibleCLCMDDt(const DataType dt) {
} }
} }
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws, std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) { const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t cache_size = uint64_t cache_size = runtime->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] = lws[2] =
...@@ -245,13 +245,12 @@ std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws, ...@@ -245,13 +245,12 @@ std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
return lws; return lws;
} }
MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime,
const cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future) { StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size = const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
...@@ -366,29 +365,28 @@ MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -366,29 +365,28 @@ MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
} }
return error; return error;
}; };
OpenCLProfilingTimer timer(&event); OpenCLProfilingTimer timer(runtime, &event);
cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>( cl_int err = runtime->tuner()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer); tuning_key, lws, params_generator, func, &timer);
MACE_CL_RET_STATUS(err); MACE_CL_RET_STATUS(err);
if (future != nullptr) { if (future != nullptr) {
future->wait_fn = [event](CallStats *stats) { future->wait_fn = [runtime, event](CallStats *stats) {
event.wait(); event.wait();
if (stats != nullptr) { if (stats != nullptr) {
OpenCLRuntime::Global()->GetCallStats(event, stats); runtime->GetCallStats(event, stats);
} }
}; };
} }
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime,
const cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future) { StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size = const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
...@@ -475,8 +473,8 @@ MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -475,8 +473,8 @@ MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
} }
return error; return error;
}; };
OpenCLProfilingTimer timer(&event); OpenCLProfilingTimer timer(runtime, &event);
cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>( cl_int err = runtime->tuner()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer); tuning_key, lws, params_generator, func, &timer);
MACE_CL_RET_STATUS(err); MACE_CL_RET_STATUS(err);
......
...@@ -31,11 +31,11 @@ ...@@ -31,11 +31,11 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
#define OUT_OF_RANGE_CONFIG(kernel_error) \ #define OUT_OF_RANGE_CONFIG(kernel_error, context) \
if (runtime->IsOutOfRangeCheckEnabled()) { \ if (runtime->IsOutOfRangeCheckEnabled()) { \
built_options.emplace("-DOUT_OF_RANGE_CHECK"); \ built_options.emplace("-DOUT_OF_RANGE_CHECK"); \
(kernel_error) = std::move(std::unique_ptr<Buffer>( \ (kernel_error) = std::move(std::unique_ptr<Buffer>( \
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); \ new Buffer((context)->device()->allocator()))); \
MACE_RETURN_IF_ERROR((kernel_error)->Allocate(1)); \ MACE_RETURN_IF_ERROR((kernel_error)->Allocate(1)); \
(kernel_error)->Map(nullptr); \ (kernel_error)->Map(nullptr); \
*((kernel_error)->mutable_data<char>()) = 0; \ *((kernel_error)->mutable_data<char>()) = 0; \
...@@ -115,14 +115,16 @@ std::string DtToCLDt(const DataType dt); ...@@ -115,14 +115,16 @@ std::string DtToCLDt(const DataType dt);
std::string DtToUpCompatibleCLDt(const DataType dt); std::string DtToUpCompatibleCLDt(const DataType dt);
// Tuning or Run OpenCL kernel with 3D work group size // Tuning or Run OpenCL kernel with 3D work group size
MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime,
const cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future); StatsFuture *future);
// Tuning or Run OpenCL kernel with 2D work group size // Tuning or Run OpenCL kernel with 2D work group size
MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime,
const cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
...@@ -162,7 +164,8 @@ std::string Concat(Args... args) { ...@@ -162,7 +164,8 @@ std::string Concat(Args... args) {
return ss.str(); return ss.str();
} }
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws, std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size); const uint32_t kwg_size);
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -67,12 +67,12 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -67,12 +67,12 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
break; break;
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
......
...@@ -38,11 +38,11 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()( ...@@ -38,11 +38,11 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()(
const index_t width = input->dim(1); const index_t width = input->dim(1);
const index_t width_blocks = width / 4; const index_t width_blocks = width / 4;
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell");
...@@ -88,7 +88,7 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()( ...@@ -88,7 +88,7 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()(
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key = std::string tuning_key =
Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1)); Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -53,11 +53,11 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -53,11 +53,11 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
static_cast<uint32_t>(height_blocks * batch), static_cast<uint32_t>(height_blocks * batch),
}; };
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
...@@ -84,7 +84,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -84,7 +84,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width); std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -16,6 +16,8 @@ ...@@ -16,6 +16,8 @@
#include <vector> #include <vector>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "mace/core/op_kernel_context.h"
#include "mace/core/runtime/opencl/gpu_device.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/workspace.h" #include "mace/core/workspace.h"
...@@ -25,14 +27,15 @@ namespace mace { ...@@ -25,14 +27,15 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
bool BufferToImageOpImpl(Tensor *buffer, bool BufferToImageOpImpl(OpKernelContext *context,
Tensor *buffer,
Tensor *image, Tensor *image,
const std::vector<size_t> &image_shape) { const std::vector<size_t> &image_shape) {
std::unique_ptr<BufferBase> kernel_error; std::unique_ptr<BufferBase> kernel_error;
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]), uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])}; static_cast<uint32_t>(image_shape[1])};
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
std::string kernel_name = "in_out_buffer_to_image"; std::string kernel_name = "in_out_buffer_to_image";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
...@@ -40,7 +43,7 @@ bool BufferToImageOpImpl(Tensor *buffer, ...@@ -40,7 +43,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str()); built_options.emplace(kernel_name_ss.str());
OUT_OF_RANGE_CONFIG(kernel_error); OUT_OF_RANGE_CONFIG(kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
if (buffer->dtype() == image->dtype()) { if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DDATA_TYPE=" + built_options.emplace("-DDATA_TYPE=" +
...@@ -127,25 +130,33 @@ TEST(OutOfRangeCheckTest, RandomTest) { ...@@ -127,25 +130,33 @@ TEST(OutOfRangeCheckTest, RandomTest) {
index_t width = 7; index_t width = 7;
index_t channels = 11; index_t channels = 11;
std::vector<index_t> buffer_shape = {batch, height, width, channels}; GPUContext gpu_context;
std::unique_ptr<Device> device(new GPUDevice(gpu_context.opencl_tuner()));
Workspace ws; Workspace ws;
OpKernelContext context(&ws, device.get());
std::vector<index_t> buffer_shape = {batch, height, width, channels};
Tensor *buffer = Tensor *buffer =
ws.CreateTensor("Buffer", GetDeviceAllocator(DeviceType::GPU), ws.CreateTensor("Buffer", device->allocator(),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
buffer->Resize(buffer_shape); buffer->Resize(buffer_shape);
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
Tensor *image = ws.CreateTensor("Image", GetDeviceAllocator(DeviceType::GPU), Tensor *image = ws.CreateTensor("Image", device->allocator(),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape); CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape);
image->ResizeImage(buffer->shape(), image_shape); image->ResizeImage(buffer->shape(), image_shape);
ASSERT_FALSE(BufferToImageOpImpl(buffer, image, image_shape)); ASSERT_FALSE(BufferToImageOpImpl(&context, buffer, image, image_shape));
std::vector<size_t> overflow_image_shape = image_shape; std::vector<size_t> overflow_image_shape = image_shape;
for (size_t i = 0; i < overflow_image_shape.size(); ++i) { for (size_t i = 0; i < overflow_image_shape.size(); ++i) {
overflow_image_shape[i] += 1; overflow_image_shape[i] += 1;
} }
ASSERT_TRUE(BufferToImageOpImpl(buffer, image, overflow_image_shape)); ASSERT_TRUE(BufferToImageOpImpl(&context,
buffer,
image,
overflow_image_shape));
} }
} // namespace kernels } // namespace kernels
......
...@@ -47,11 +47,11 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -47,11 +47,11 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const index_t channel_blocks = RoundUpDiv4(channels); const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
built_options.emplace("-Dpad=" + kernel_name); built_options.emplace("-Dpad=" + kernel_name);
...@@ -85,10 +85,10 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -85,10 +85,10 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = Concat("pad", output->dim(0), output->dim(1), std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -23,13 +23,15 @@ namespace kernels { ...@@ -23,13 +23,15 @@ namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] = lws[2] =
...@@ -54,12 +56,12 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -54,12 +56,12 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
<< "Pooling opencl kernel not support dilation yet"; << "Pooling opencl kernel not support dilation yet";
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value; const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name); built_options.emplace("-Dpooling=" + kernel_name);
...@@ -149,11 +151,11 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -149,11 +151,11 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
}; };
} }
const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_); const std::vector<uint32_t> lws = LocalWS(runtime, gws.data(), kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws.data(), lws, future)); gws.data(), lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -39,7 +39,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()( ...@@ -39,7 +39,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
const index_t channel_blocks = RoundUpDiv4(channels); const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t image_size = static_cast<uint32_t >(in_height * in_width); const uint32_t image_size = static_cast<uint32_t >(in_height * in_width);
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
std::vector<uint32_t> gws(3); std::vector<uint32_t> gws(3);
std::vector<uint32_t> lws(3); std::vector<uint32_t> lws(3);
std::vector<index_t> output_shape{batch, 1, 1, channels}; std::vector<index_t> output_shape{batch, 1, 1, channels};
...@@ -50,7 +50,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()( ...@@ -50,7 +50,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value; const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean");
built_options.emplace("-Dreduce_mean=" + kernel_name); built_options.emplace("-Dreduce_mean=" + kernel_name);
......
...@@ -23,9 +23,11 @@ namespace mace { ...@@ -23,9 +23,11 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); uint64_t cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) { if (lws[1] >= base) {
...@@ -65,15 +67,15 @@ MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()( ...@@ -65,15 +67,15 @@ MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(out_width), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)}; static_cast<uint32_t>(out_height * batch)};
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
auto dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache");
built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name); built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize)); built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize));
...@@ -115,11 +117,11 @@ MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()( ...@@ -115,11 +117,11 @@ MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1), Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -23,13 +23,15 @@ namespace mace { ...@@ -23,13 +23,15 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) { if (lws[1] >= base) {
...@@ -70,11 +72,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -70,11 +72,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(out_width), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)}; static_cast<uint32_t>(out_height * batch)};
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
...@@ -118,11 +120,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -118,11 +120,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1), Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -24,13 +24,15 @@ namespace kernels { ...@@ -24,13 +24,15 @@ namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (gws[0] < base) { if (gws[0] < base) {
...@@ -78,11 +80,11 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, ...@@ -78,11 +80,11 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name); built_options.emplace("-Dsoftmax=" + kernel_name);
...@@ -107,10 +109,10 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, ...@@ -107,10 +109,10 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
input_shape_ = logits->shape(); input_shape_ = logits->shape();
} }
std::vector<uint32_t> lws = LocalWS(gws, kwg_size_); std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels); Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -54,12 +54,12 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -54,12 +54,12 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)), chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))}; static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
...@@ -99,11 +99,11 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -99,11 +99,11 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
space_shape_ = space_tensor->shape(); space_shape_ = space_tensor->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1), Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3)); batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -40,11 +40,11 @@ MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()( ...@@ -40,11 +40,11 @@ MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()(
output_list[i]->ResizeImage(output_shape, image_shape)); output_list[i]->ResizeImage(output_shape, image_shape));
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
built_options.emplace("-Dsplit=" + kernel_name); built_options.emplace("-Dsplit=" + kernel_name);
...@@ -66,7 +66,7 @@ MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()( ...@@ -66,7 +66,7 @@ MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(input->dim(0) * input->dim(1)), static_cast<uint32_t>(input->dim(0) * input->dim(1)),
}; };
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event; cl::Event event;
CallStats call_stats{INT64_MAX, 0}; CallStats call_stats{INT64_MAX, 0};
for (size_t i = 0; i < outputs_count; ++i) { for (size_t i = 0; i < outputs_count; ++i) {
......
...@@ -24,12 +24,12 @@ namespace kernels { ...@@ -24,12 +24,12 @@ namespace kernels {
template <typename T> template <typename T>
MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) { const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name; std::string obfuscated_kernel_name;
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
if (wino_blk_size_ == 4) { if (wino_blk_size_ == 4) {
obfuscated_kernel_name = obfuscated_kernel_name =
...@@ -120,7 +120,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -120,7 +120,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
output_tensor->dim(0), output_tensor->dim(0),
output_tensor->dim(1), output_tensor->dim(1),
output_tensor->dim(2)); output_tensor->dim(2));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
...@@ -132,7 +132,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -132,7 +132,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
const std::vector<const Tensor*> &inputs, const std::vector<const Tensor*> &inputs,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future) { StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
const Tensor *input_tensor = inputs[0]; const Tensor *input_tensor = inputs[0];
const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr; const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr;
...@@ -140,7 +140,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -140,7 +140,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name; std::string obfuscated_kernel_name;
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
if (wino_blk_size_ == 4) { if (wino_blk_size_ == 4) {
obfuscated_kernel_name = obfuscated_kernel_name =
...@@ -241,7 +241,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -241,7 +241,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
Concat("winograd_inverse_transform_kernel", output_tensor->dim(0), Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(1), output_tensor->dim(2),
output_tensor->dim(3), input_tensor->dim(2)); output_tensor->dim(3), input_tensor->dim(2));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -29,10 +30,13 @@ ...@@ -29,10 +30,13 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct PadFunctorBase { struct PadFunctorBase : OpKernel {
PadFunctorBase(const std::vector<int> &paddings, PadFunctorBase(OpKernelContext *context,
const std::vector<int> &paddings,
const float constant_value) const float constant_value)
: paddings_(paddings), constant_value_(constant_value) {} : OpKernel(context),
paddings_(paddings),
constant_value_(constant_value) {}
std::vector<int> paddings_; std::vector<int> paddings_;
float constant_value_; float constant_value_;
...@@ -40,9 +44,10 @@ struct PadFunctorBase { ...@@ -40,9 +44,10 @@ struct PadFunctorBase {
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct PadFunctor : public PadFunctorBase { struct PadFunctor : public PadFunctorBase {
PadFunctor(const std::vector<int> &paddings, PadFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const float constant_value) const float constant_value)
: PadFunctorBase(paddings, constant_value) {} : PadFunctorBase(context, paddings, constant_value) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -93,9 +98,10 @@ struct PadFunctor : public PadFunctorBase { ...@@ -93,9 +98,10 @@ struct PadFunctor : public PadFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct PadFunctor<DeviceType::GPU, T> : PadFunctorBase { struct PadFunctor<DeviceType::GPU, T> : PadFunctorBase {
PadFunctor(const std::vector<int> &paddings, PadFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const float constant_value) const float constant_value)
: PadFunctorBase(paddings, constant_value) {} : PadFunctorBase(context, paddings, constant_value) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/kernel.h"
#if defined(MACE_ENABLE_NEON) #if defined(MACE_ENABLE_NEON)
#include <arm_neon.h> #include <arm_neon.h>
...@@ -41,14 +42,16 @@ enum PoolingType { ...@@ -41,14 +42,16 @@ enum PoolingType {
namespace kernels { namespace kernels {
struct PoolingFunctorBase { struct PoolingFunctorBase : OpKernel {
PoolingFunctorBase(const PoolingType pooling_type, PoolingFunctorBase(OpKernelContext *context,
const PoolingType pooling_type,
const int *kernels, const int *kernels,
const int *strides, const int *strides,
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations) const int *dilations)
: pooling_type_(pooling_type), : OpKernel(context),
pooling_type_(pooling_type),
kernels_(kernels), kernels_(kernels),
strides_(strides), strides_(strides),
padding_type_(padding_type), padding_type_(padding_type),
...@@ -68,14 +71,20 @@ struct PoolingFunctor; ...@@ -68,14 +71,20 @@ struct PoolingFunctor;
template <> template <>
struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase { struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type, PoolingFunctor(OpKernelContext *context,
const PoolingType pooling_type,
const int *kernels, const int *kernels,
const int *strides, const int *strides,
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations) const int *dilations)
: PoolingFunctorBase( : PoolingFunctorBase(context,
pooling_type, kernels, strides, padding_type, paddings, dilations) { pooling_type,
kernels,
strides,
padding_type,
paddings,
dilations) {
} }
void MaxPooling(const float *input, void MaxPooling(const float *input,
...@@ -231,15 +240,20 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase { ...@@ -231,15 +240,20 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
template <> template <>
struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase { struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type, PoolingFunctor(OpKernelContext *context,
const PoolingType pooling_type,
const int *kernels, const int *kernels,
const int *strides, const int *strides,
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations) const int *dilations)
: PoolingFunctorBase( : PoolingFunctorBase(context,
pooling_type, kernels, strides, padding_type, paddings, dilations) { pooling_type,
} kernels,
strides,
padding_type,
paddings,
dilations) {}
void MaxPooling(const uint8_t *input, void MaxPooling(const uint8_t *input,
const index_t *in_shape, const index_t *in_shape,
...@@ -443,14 +457,20 @@ struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase { ...@@ -443,14 +457,20 @@ struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase { struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type, PoolingFunctor(OpKernelContext *context,
const PoolingType pooling_type,
const int *kernels, const int *kernels,
const int *strides, const int *strides,
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations) const int *dilations)
: PoolingFunctorBase( : PoolingFunctorBase(context,
pooling_type, kernels, strides, padding_type, paddings, dilations) { pooling_type,
kernels,
strides,
padding_type,
paddings,
dilations) {
} }
MaceStatus operator()(const Tensor *input_tensor, MaceStatus operator()(const Tensor *input_tensor,
Tensor *output_tensor, Tensor *output_tensor,
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
...@@ -121,8 +122,9 @@ inline std::vector<int> nms(const float *bboxes_ptr, ...@@ -121,8 +122,9 @@ inline std::vector<int> nms(const float *bboxes_ptr,
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct ProposalFunctor { struct ProposalFunctor : OpKernel {
ProposalFunctor(const int min_size, ProposalFunctor(OpKernelContext *context,
const int min_size,
const float nms_thresh, const float nms_thresh,
const int pre_nms_top_n, const int pre_nms_top_n,
const int post_nms_top_n, const int post_nms_top_n,
...@@ -130,6 +132,7 @@ struct ProposalFunctor { ...@@ -130,6 +132,7 @@ struct ProposalFunctor {
const int base_size, const int base_size,
const std::vector<int> &scales, const std::vector<int> &scales,
const std::vector<float> &ratios) : const std::vector<float> &ratios) :
OpKernel(context),
min_size_(min_size), min_size_(min_size),
thresh_(nms_thresh), thresh_(nms_thresh),
pre_nms_top_n_(pre_nms_top_n), pre_nms_top_n_(pre_nms_top_n),
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -173,8 +174,8 @@ template<DeviceType D, typename T> ...@@ -173,8 +174,8 @@ template<DeviceType D, typename T>
struct QuantizeFunctor; struct QuantizeFunctor;
template<> template<>
struct QuantizeFunctor<CPU, uint8_t> { struct QuantizeFunctor<CPU, uint8_t> : OpKernel {
QuantizeFunctor() {} explicit QuantizeFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const bool non_zero, const bool non_zero,
...@@ -212,8 +213,8 @@ template<DeviceType D, typename T> ...@@ -212,8 +213,8 @@ template<DeviceType D, typename T>
struct DequantizeFunctor; struct DequantizeFunctor;
template<> template<>
struct DequantizeFunctor<CPU, uint8_t> { struct DequantizeFunctor<CPU, uint8_t> : OpKernel {
DequantizeFunctor() {} explicit DequantizeFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#endif #endif
...@@ -31,10 +32,12 @@ ...@@ -31,10 +32,12 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct ReduceFunctorBase { struct ReduceFunctorBase : OpKernel {
ReduceFunctorBase(const std::vector<int> &axis, ReduceFunctorBase(OpKernelContext *context,
const std::vector<int> &axis,
const bool keep_dims) const bool keep_dims)
: keep_dims_(keep_dims), : OpKernel(context),
keep_dims_(keep_dims),
axis_(axis) {} axis_(axis) {}
bool keep_dims_; bool keep_dims_;
bool reduce_first_axis_; bool reduce_first_axis_;
...@@ -44,10 +47,11 @@ struct ReduceFunctorBase { ...@@ -44,10 +47,11 @@ struct ReduceFunctorBase {
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct ReduceMeanFunctor : ReduceFunctorBase{ struct ReduceMeanFunctor : ReduceFunctorBase {
ReduceMeanFunctor(const std::vector<int> &axis, ReduceMeanFunctor(OpKernelContext *context,
const std::vector<int> &axis,
const bool keep_dims) const bool keep_dims)
: ReduceFunctorBase(axis, keep_dims) {} : ReduceFunctorBase(context, axis, keep_dims) {}
void Simplify(const Tensor *input) { void Simplify(const Tensor *input) {
std::vector<bool> bitmap(static_cast<uint32_t>(input->dim_size()), false); std::vector<bool> bitmap(static_cast<uint32_t>(input->dim_size()), false);
...@@ -220,9 +224,10 @@ struct ReduceMeanFunctor : ReduceFunctorBase{ ...@@ -220,9 +224,10 @@ struct ReduceMeanFunctor : ReduceFunctorBase{
template <typename T> template <typename T>
struct ReduceMeanFunctor<DeviceType::GPU, T> struct ReduceMeanFunctor<DeviceType::GPU, T>
: ReduceFunctorBase { : ReduceFunctorBase {
ReduceMeanFunctor(const std::vector<int> axis, ReduceMeanFunctor(OpKernelContext *context,
const std::vector<int> axis,
const bool keep_dims) const bool keep_dims)
: ReduceFunctorBase(axis, keep_dims) {} : ReduceFunctorBase(context, axis, keep_dims) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output_tensor, Tensor *output_tensor,
......
...@@ -19,17 +19,14 @@ ...@@ -19,17 +19,14 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct ReshapeFunctor { struct ReshapeFunctor : OpKernel {
ReshapeFunctor() {} explicit ReshapeFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const std::vector<index_t> &out_shape, const std::vector<index_t> &out_shape,
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -137,10 +138,11 @@ inline void ResizeImage(const float *images, ...@@ -137,10 +138,11 @@ inline void ResizeImage(const float *images,
} }
} }
struct ResizeBicubicFunctorBase { struct ResizeBicubicFunctorBase : OpKernel {
ResizeBicubicFunctorBase(const std::vector<index_t> &size, ResizeBicubicFunctorBase(OpKernelContext *context,
const std::vector<index_t> &size,
bool align_corners) bool align_corners)
: align_corners_(align_corners) { : OpKernel(context), align_corners_(align_corners) {
MACE_CHECK(size.size() == 2); MACE_CHECK(size.size() == 2);
out_height_ = size[0]; out_height_ = size[0];
out_width_ = size[1]; out_width_ = size[1];
...@@ -158,8 +160,10 @@ struct ResizeBicubicFunctor; ...@@ -158,8 +160,10 @@ struct ResizeBicubicFunctor;
template<> template<>
struct ResizeBicubicFunctor<DeviceType::CPU, float> struct ResizeBicubicFunctor<DeviceType::CPU, float>
: ResizeBicubicFunctorBase { : ResizeBicubicFunctorBase {
ResizeBicubicFunctor(const std::vector<index_t> &size, bool align_corners) ResizeBicubicFunctor(OpKernelContext *context,
: ResizeBicubicFunctorBase(size, align_corners) {} const std::vector<index_t> &size,
bool align_corners)
: ResizeBicubicFunctorBase(context, size, align_corners) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -204,8 +208,10 @@ struct ResizeBicubicFunctor<DeviceType::CPU, float> ...@@ -204,8 +208,10 @@ struct ResizeBicubicFunctor<DeviceType::CPU, float>
template<typename T> template<typename T>
struct ResizeBicubicFunctor<DeviceType::GPU, T> struct ResizeBicubicFunctor<DeviceType::GPU, T>
: ResizeBicubicFunctorBase { : ResizeBicubicFunctorBase {
ResizeBicubicFunctor(const std::vector<index_t> &size, bool align_corners) ResizeBicubicFunctor(OpKernelContext *context,
: ResizeBicubicFunctorBase(size, align_corners) {} const std::vector<index_t> &size,
bool align_corners)
: ResizeBicubicFunctorBase(context, size, align_corners) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -113,10 +114,12 @@ inline void ResizeImage(const float *images, ...@@ -113,10 +114,12 @@ inline void ResizeImage(const float *images,
} }
} }
struct ResizeBilinearFunctorBase { struct ResizeBilinearFunctorBase : OpKernel {
ResizeBilinearFunctorBase(const std::vector<index_t> &size, ResizeBilinearFunctorBase(OpKernelContext *context,
const std::vector<index_t> &size,
bool align_corners) bool align_corners)
: align_corners_(align_corners) { : OpKernel(context),
align_corners_(align_corners) {
MACE_CHECK(size.size() == 2); MACE_CHECK(size.size() == 2);
out_height_ = size[0]; out_height_ = size[0];
out_width_ = size[1]; out_width_ = size[1];
...@@ -134,8 +137,10 @@ struct ResizeBilinearFunctor; ...@@ -134,8 +137,10 @@ struct ResizeBilinearFunctor;
template<> template<>
struct ResizeBilinearFunctor<DeviceType::CPU, float> struct ResizeBilinearFunctor<DeviceType::CPU, float>
: ResizeBilinearFunctorBase { : ResizeBilinearFunctorBase {
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners) ResizeBilinearFunctor(OpKernelContext *context,
: ResizeBilinearFunctorBase(size, align_corners) {} const std::vector<index_t> &size,
bool align_corners)
: ResizeBilinearFunctorBase(context, size, align_corners) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -187,8 +192,10 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float> ...@@ -187,8 +192,10 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
template<typename T> template<typename T>
struct ResizeBilinearFunctor<DeviceType::GPU, T> struct ResizeBilinearFunctor<DeviceType::GPU, T>
: ResizeBilinearFunctorBase { : ResizeBilinearFunctorBase {
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners) ResizeBilinearFunctor(OpKernelContext *context,
: ResizeBilinearFunctorBase(size, align_corners) {} const std::vector<index_t> &size,
bool align_corners)
: ResizeBilinearFunctorBase(context, size, align_corners) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
......
...@@ -89,12 +89,14 @@ void ScalarEltwise(const T* in0, ...@@ -89,12 +89,14 @@ void ScalarEltwise(const T* in0,
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct ScalarMathFunctor { struct ScalarMathFunctor : OpKernel {
explicit ScalarMathFunctor(const EltwiseType type, ScalarMathFunctor(OpKernelContext *context,
const std::vector<float> &coeff, const EltwiseType type,
const float scalar_input, const std::vector<float> &coeff,
const int32_t scalar_input_index) const float scalar_input,
: type_(type), const int32_t scalar_input_index)
: OpKernel(context),
type_(type),
coeff_(coeff), coeff_(coeff),
scalar_input_(scalar_input), scalar_input_(scalar_input),
scalar_input_index_(scalar_input_index) {} scalar_input_index_(scalar_input_index) {}
......
...@@ -89,7 +89,7 @@ typedef Major PackOrder; ...@@ -89,7 +89,7 @@ typedef Major PackOrder;
template<typename T> template<typename T>
class PackedBlock { class PackedBlock {
public: public:
PackedBlock() : data_tensor_(GetDeviceAllocator(CPU), PackedBlock() : data_tensor_(GetCPUAllocator(),
DataTypeToEnum<T>::v()) {} DataTypeToEnum<T>::v()) {}
const T *data() { const T *data() {
......
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
#include "mace/kernels/fixpoint.h" #include "mace/kernels/fixpoint.h"
#include "mace/kernels/gemmlowp_util.h" #include "mace/kernels/gemmlowp_util.h"
#include "mace/kernels/kernel.h"
#include "mace/kernels/quantize.h" #include "mace/kernels/quantize.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -40,7 +41,8 @@ template<DeviceType D, typename T> ...@@ -40,7 +41,8 @@ template<DeviceType D, typename T>
struct SoftmaxFunctor; struct SoftmaxFunctor;
template<> template<>
struct SoftmaxFunctor<DeviceType::CPU, float> { struct SoftmaxFunctor<DeviceType::CPU, float> : OpKernel {
explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -127,7 +129,8 @@ static const int kInputDeltaIntBits = 6; ...@@ -127,7 +129,8 @@ static const int kInputDeltaIntBits = 6;
static const int kSumExpIntBits = 12; static const int kSumExpIntBits = 12;
template<> template<>
struct SoftmaxFunctor<DeviceType::CPU, uint8_t> { struct SoftmaxFunctor<DeviceType::CPU, uint8_t> : OpKernel {
explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -354,7 +357,8 @@ struct SoftmaxFunctor<DeviceType::CPU, uint8_t> { ...@@ -354,7 +357,8 @@ struct SoftmaxFunctor<DeviceType::CPU, uint8_t> {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct SoftmaxFunctor<DeviceType::GPU, T> { struct SoftmaxFunctor<DeviceType::GPU, T> : OpKernel {
explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *logits, MaceStatus operator()(const Tensor *logits,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/public/mace.h" #include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -30,11 +30,13 @@ ...@@ -30,11 +30,13 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct SpaceToBatchFunctorBase { struct SpaceToBatchFunctorBase : OpKernel {
SpaceToBatchFunctorBase(const std::vector<int> &paddings, SpaceToBatchFunctorBase(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape, const std::vector<int> &block_shape,
bool b2s) bool b2s)
: paddings_(paddings.begin(), paddings.end()), : OpKernel(context),
paddings_(paddings.begin(), paddings.end()),
block_shape_(block_shape.begin(), block_shape.end()), block_shape_(block_shape.begin(), block_shape.end()),
b2s_(b2s) { b2s_(b2s) {
MACE_CHECK( MACE_CHECK(
...@@ -135,10 +137,11 @@ struct SpaceToBatchFunctor; ...@@ -135,10 +137,11 @@ struct SpaceToBatchFunctor;
template<> template<>
struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase { struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(const std::vector<int> &paddings, SpaceToBatchFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape, const std::vector<int> &block_shape,
bool b2s) bool b2s)
: SpaceToBatchFunctorBase(paddings, block_shape, b2s) {} : SpaceToBatchFunctorBase(context, paddings, block_shape, b2s) {}
MaceStatus operator()(Tensor *space_tensor, MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor, Tensor *batch_tensor,
...@@ -319,10 +322,11 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase { ...@@ -319,10 +322,11 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase { struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(const std::vector<int> &paddings, SpaceToBatchFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape, const std::vector<int> &block_shape,
bool b2s) bool b2s)
: SpaceToBatchFunctorBase(paddings, block_shape, b2s) {} : SpaceToBatchFunctorBase(context, paddings, block_shape, b2s) {}
MaceStatus operator()(Tensor *space_tensor, MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor, Tensor *batch_tensor,
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -31,15 +32,17 @@ ...@@ -31,15 +32,17 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct SplitFunctorBase { struct SplitFunctorBase : OpKernel {
explicit SplitFunctorBase(const int32_t axis) : axis_(axis) {} SplitFunctorBase(OpKernelContext *context, const int32_t axis)
: OpKernel(context), axis_(axis) {}
int32_t axis_; int32_t axis_;
}; };
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct SplitFunctor : SplitFunctorBase { struct SplitFunctor : SplitFunctorBase {
explicit SplitFunctor(const int32_t axis) : SplitFunctorBase(axis) {} SplitFunctor(OpKernelContext *context, const int32_t axis)
: SplitFunctorBase(context, axis) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const std::vector<Tensor *> &output_list, const std::vector<Tensor *> &output_list,
...@@ -90,11 +93,12 @@ struct SplitFunctor : SplitFunctorBase { ...@@ -90,11 +93,12 @@ struct SplitFunctor : SplitFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct SplitFunctor<DeviceType::GPU, T> : SplitFunctorBase { struct SplitFunctor<DeviceType::GPU, T> : SplitFunctorBase {
explicit SplitFunctor(const int32_t axis) : SplitFunctorBase(axis) {} SplitFunctor(OpKernelContext *context, const int32_t axis)
: SplitFunctorBase(context, axis) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const std::vector<Tensor *> &output_list, const std::vector<Tensor *> &output_list,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_; std::unique_ptr<BufferBase> kernel_error_;
......
...@@ -22,14 +22,16 @@ ...@@ -22,14 +22,16 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct StackFunctor { struct StackFunctor : OpKernel {
explicit StackFunctor(int axis) : axis_(axis) {} StackFunctor(OpKernelContext *context, int axis)
: OpKernel(context), axis_(axis) {}
MaceStatus operator()(const std::vector<const Tensor *> &inputs, MaceStatus operator()(const std::vector<const Tensor *> &inputs,
Tensor *output, Tensor *output,
......
...@@ -21,26 +21,29 @@ ...@@ -21,26 +21,29 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct StridedSliceFunctor { struct StridedSliceFunctor : OpKernel {
StridedSliceFunctor(int begin_mask, StridedSliceFunctor(OpKernelContext *context,
int begin_mask,
int end_mask, int end_mask,
int ellipsis_mask, int ellipsis_mask,
int new_axis_mask, int new_axis_mask,
int shrink_axis_mask, int shrink_axis_mask,
bool is_slice) bool is_slice)
: begin_mask_(begin_mask), : OpKernel(context),
begin_mask_(begin_mask),
end_mask_(end_mask), end_mask_(end_mask),
ellipsis_mask_(ellipsis_mask), ellipsis_mask_(ellipsis_mask),
new_axis_mask_(new_axis_mask), new_axis_mask_(new_axis_mask),
shrink_axis_mask_(shrink_axis_mask), shrink_axis_mask_(shrink_axis_mask),
is_slice_(is_slice), is_slice_(is_slice),
tmp_strides_tensor_(GetDeviceAllocator(D), tmp_strides_tensor_(context->device()->allocator(),
DataTypeToEnum<int32_t>::v()) {} DataTypeToEnum<int32_t>::v()) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
......
...@@ -105,8 +105,9 @@ static void TransposeNCHWToNHWCC2(const float *input, ...@@ -105,8 +105,9 @@ static void TransposeNCHWToNHWCC2(const float *input,
} }
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct TransposeFunctor { struct TransposeFunctor : OpKernel {
explicit TransposeFunctor(const std::vector<int> &dims) : dims_(dims) {} TransposeFunctor(OpKernelContext *context, const std::vector<int> &dims)
: OpKernel(context), dims_(dims) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
......
...@@ -22,14 +22,16 @@ ...@@ -22,14 +22,16 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct UnstackFunctor { struct UnstackFunctor : OpKernel {
explicit UnstackFunctor(int axis) : axis_(axis) {} UnstackFunctor(OpKernelContext *context, int axis)
: OpKernel(context), axis_(axis) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const std::vector<Tensor *> &outputs, const std::vector<Tensor *> &outputs,
......
...@@ -30,11 +30,13 @@ ...@@ -30,11 +30,13 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct WinogradTransformFunctorBase { struct WinogradTransformFunctorBase : OpKernel {
WinogradTransformFunctorBase(const Padding &padding_type, WinogradTransformFunctorBase(OpKernelContext *context,
const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int block_size) const int block_size)
: strides_({1, 1}), : OpKernel(context),
strides_({1, 1}),
dilations_({1, 1}), dilations_({1, 1}),
padding_type_(padding_type), padding_type_(padding_type),
paddings_(paddings), paddings_(paddings),
...@@ -49,10 +51,14 @@ struct WinogradTransformFunctorBase { ...@@ -49,10 +51,14 @@ struct WinogradTransformFunctorBase {
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct WinogradTransformFunctor : WinogradTransformFunctorBase { struct WinogradTransformFunctor : WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type, WinogradTransformFunctor(OpKernelContext *context,
const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int block_size) const int block_size)
: WinogradTransformFunctorBase(padding_type, paddings, block_size) {} : WinogradTransformFunctorBase(context,
padding_type,
paddings,
block_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -69,10 +75,14 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase { ...@@ -69,10 +75,14 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase {
template<typename T> template<typename T>
struct WinogradTransformFunctor<DeviceType::GPU, T> struct WinogradTransformFunctor<DeviceType::GPU, T>
: WinogradTransformFunctorBase { : WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type, WinogradTransformFunctor(OpKernelContext *context,
const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int block_size) const int block_size)
: WinogradTransformFunctorBase(padding_type, paddings, block_size) {} : WinogradTransformFunctorBase(context,
padding_type,
paddings,
block_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -85,11 +95,13 @@ struct WinogradTransformFunctor<DeviceType::GPU, T> ...@@ -85,11 +95,13 @@ struct WinogradTransformFunctor<DeviceType::GPU, T>
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
struct WinogradInverseTransformFunctorBase { struct WinogradInverseTransformFunctorBase : OpKernel {
WinogradInverseTransformFunctorBase(const ActivationType activation, WinogradInverseTransformFunctorBase(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const int block_size) const int block_size)
: wino_blk_size_(block_size), : OpKernel(context),
wino_blk_size_(block_size),
activation_(activation), activation_(activation),
relux_max_limit_(relux_max_limit) {} relux_max_limit_(relux_max_limit) {}
...@@ -100,11 +112,12 @@ struct WinogradInverseTransformFunctorBase { ...@@ -100,11 +112,12 @@ struct WinogradInverseTransformFunctorBase {
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
WinogradInverseTransformFunctor(const ActivationType activation, WinogradInverseTransformFunctor(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const int block_size) const int block_size)
: WinogradInverseTransformFunctorBase( : WinogradInverseTransformFunctorBase(
activation, relux_max_limit, block_size) {} context, activation, relux_max_limit, block_size) {}
MaceStatus operator()(const std::vector<const Tensor*> &inputs, MaceStatus operator()(const std::vector<const Tensor*> &inputs,
Tensor *output, Tensor *output,
...@@ -121,11 +134,12 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { ...@@ -121,11 +134,12 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
template <typename T> template <typename T>
struct WinogradInverseTransformFunctor<DeviceType::GPU, T> struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
: WinogradInverseTransformFunctorBase { : WinogradInverseTransformFunctorBase {
WinogradInverseTransformFunctor(const ActivationType activation, WinogradInverseTransformFunctor(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const int block_size) const int block_size)
: WinogradInverseTransformFunctorBase( : WinogradInverseTransformFunctorBase(
activation, relux_max_limit, block_size) {} context, activation, relux_max_limit, block_size) {}
MaceStatus operator()(const std::vector<const Tensor*> &inputs, MaceStatus operator()(const std::vector<const Tensor*> &inputs,
Tensor *output, Tensor *output,
......
...@@ -21,10 +21,12 @@ ...@@ -21,10 +21,12 @@
#include <memory> #include <memory>
#include "mace/core/net.h" #include "mace/core/net.h"
#include "mace/core/device_context.h"
#include "mace/ops/ops_register.h" #include "mace/ops/ops_register.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/gpu_device.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -63,9 +65,9 @@ void UnloadModelData(const unsigned char *model_data, ...@@ -63,9 +65,9 @@ void UnloadModelData(const unsigned char *model_data,
} }
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
MaceStatus CheckGPUAvalibility(const NetDef *net_def) { MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
// Check OpenCL avaliable // Check OpenCL avaliable
auto runtime = OpenCLRuntime::Global(); auto runtime = device->opencl_runtime();
if (!runtime->is_opencl_avaliable()) { if (!runtime->is_opencl_avaliable()) {
return MaceStatus::MACE_OUT_OF_RESOURCES; return MaceStatus::MACE_OUT_OF_RESOURCES;
} }
...@@ -101,6 +103,199 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def) { ...@@ -101,6 +103,199 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def) {
} // namespace } // namespace
class GPUContextBuilder::Impl {
public:
void SetStoragePath(const std::string &path);
void SetOpenCLBinaryPaths(const std::vector<std::string> &paths);
void SetOpenCLParameterPath(const std::string &path);
std::shared_ptr<GPUContext> Finalize();
public:
std::string storage_path_;
std::vector<std::string> opencl_binary_paths_;
std::string opencl_parameter_path_;
};
void GPUContextBuilder::Impl::SetStoragePath(const std::string &path) {
storage_path_ = path;
}
void GPUContextBuilder::Impl::SetOpenCLBinaryPaths(
const std::vector<std::string> &paths) {
opencl_binary_paths_ = paths;
}
void GPUContextBuilder::Impl::SetOpenCLParameterPath(
const std::string &path) {
opencl_parameter_path_ = path;
}
std::shared_ptr<GPUContext> GPUContextBuilder::Impl::Finalize() {
return std::shared_ptr<GPUContext>(new GPUContext(storage_path_,
opencl_binary_paths_,
opencl_parameter_path_));
}
GPUContextBuilder::GPUContextBuilder() : impl_(new GPUContextBuilder::Impl) {}
GPUContextBuilder::~GPUContextBuilder() = default;
GPUContextBuilder &GPUContextBuilder::SetStoragePath(const std::string &path) {
impl_->SetStoragePath(path);
return *this;
}
GPUContextBuilder &GPUContextBuilder::SetOpenCLBinaryPaths(
const std::vector<std::string> &paths) {
impl_->SetOpenCLBinaryPaths(paths);
return *this;
}
GPUContextBuilder &GPUContextBuilder::SetOpenCLParameterPath(
const std::string &path) {
impl_->SetOpenCLParameterPath(path);
return *this;
}
std::shared_ptr<GPUContext> GPUContextBuilder::Finalize() {
return impl_->Finalize();
}
class MaceEngineConfig::Impl {
public:
explicit Impl(const DeviceType device_type);
~Impl() = default;
MaceStatus SetGPUContext(std::shared_ptr<GPUContext> context);
MaceStatus SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);
MaceStatus SetCPUThreadPolicy(int num_threads_hint,
CPUAffinityPolicy policy,
bool use_gemmlowp);
MaceStatus SetOpenMPThreadAffinity(int num_threads,
const std::vector<int> &cpu_ids);
inline DeviceType device_type() const {
return device_type_;
}
inline int num_threads() const {
return num_threads_;
}
inline std::shared_ptr<GPUContext> gpu_context() const {
return gpu_context_;
}
inline GPUPriorityHint gpu_priority_hint() const {
return gpu_priority_hint_;
}
inline GPUPerfHint gpu_perf_hint() const {
return gpu_perf_hint_;
}
private:
DeviceType device_type_;
int num_threads_;
std::shared_ptr<GPUContext> gpu_context_;
GPUPriorityHint gpu_priority_hint_;
GPUPerfHint gpu_perf_hint_;
};
MaceEngineConfig::Impl::Impl(const DeviceType device_type)
: device_type_(device_type),
num_threads_(-1),
gpu_context_(new GPUContext),
gpu_priority_hint_(GPUPriorityHint::PRIORITY_LOW),
gpu_perf_hint_(GPUPerfHint::PERF_NORMAL) {}
MaceStatus MaceEngineConfig::Impl::SetGPUContext(
std::shared_ptr<GPUContext> context) {
gpu_context_ = context;
return MACE_SUCCESS;
}
MaceStatus MaceEngineConfig::Impl::SetGPUHints(
GPUPerfHint perf_hint,
GPUPriorityHint priority_hint) {
gpu_perf_hint_ = perf_hint;
gpu_priority_hint_ = priority_hint;
return MACE_SUCCESS;
}
MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy(
int num_threads,
CPUAffinityPolicy policy,
bool use_gemmlowp) {
num_threads_ = num_threads;
return mace::SetOpenMPThreadsAndAffinityPolicy(
num_threads, policy, use_gemmlowp);
}
MaceStatus MaceEngineConfig::Impl::SetOpenMPThreadAffinity(
int num_threads,
const std::vector<int> &cpu_ids) {
num_threads_ = num_threads;
return mace::SetOpenMPThreadsAndAffinityCPUs(num_threads, cpu_ids);
}
MaceEngineConfig::MaceEngineConfig(
const DeviceType device_type)
: impl_(new MaceEngineConfig::Impl(device_type)) {}
MaceEngineConfig::~MaceEngineConfig() = default;
MaceStatus MaceEngineConfig::SetGPUContext(
std::shared_ptr<GPUContext> context) {
return impl_->SetGPUContext(context);
}
MaceStatus MaceEngineConfig::SetGPUHints(
GPUPerfHint perf_hint,
GPUPriorityHint priority_hint) {
return impl_->SetGPUHints(perf_hint, priority_hint);
}
MaceStatus MaceEngineConfig::SetCPUThreadPolicy(
int num_threads_hint,
CPUAffinityPolicy policy,
bool use_gemmlowp) {
return impl_->SetCPUThreadPolicy(num_threads_hint, policy, use_gemmlowp);
}
MaceStatus MaceEngineConfig::SetOpenMPThreadAffinity(
int num_threads,
const std::vector<int> &cpu_ids) {
return impl_->SetOpenMPThreadAffinity(num_threads, cpu_ids);
}
DeviceType MaceEngineConfig::device_type() const {
return impl_->device_type();
}
int MaceEngineConfig::num_threads() const {
return impl_->num_threads();
}
std::shared_ptr<GPUContext> MaceEngineConfig::gpu_context() const {
return impl_->gpu_context();
}
GPUPerfHint MaceEngineConfig::gpu_perf_hint() const {
return impl_->gpu_perf_hint();
}
GPUPriorityHint MaceEngineConfig::gpu_priority_hint() const {
return impl_->gpu_priority_hint();
}
// Mace Tensor // Mace Tensor
class MaceTensor::Impl { class MaceTensor::Impl {
public: public:
...@@ -155,7 +350,7 @@ std::shared_ptr<float> MaceTensor::data() { return impl_->data; } ...@@ -155,7 +350,7 @@ std::shared_ptr<float> MaceTensor::data() { return impl_->data; }
// Mace Engine // Mace Engine
class MaceEngine::Impl { class MaceEngine::Impl {
public: public:
explicit Impl(DeviceType device_type); explicit Impl(const MaceEngineConfig &config);
~Impl(); ~Impl();
...@@ -178,6 +373,7 @@ class MaceEngine::Impl { ...@@ -178,6 +373,7 @@ class MaceEngine::Impl {
size_t model_data_size_; size_t model_data_size_;
std::shared_ptr<OperatorRegistryBase> op_registry_; std::shared_ptr<OperatorRegistryBase> op_registry_;
DeviceType device_type_; DeviceType device_type_;
std::unique_ptr<Device> device_;
std::unique_ptr<Workspace> ws_; std::unique_ptr<Workspace> ws_;
std::unique_ptr<NetBase> net_; std::unique_ptr<NetBase> net_;
std::map<std::string, mace::InputInfo> input_info_map_; std::map<std::string, mace::InputInfo> input_info_map_;
...@@ -189,11 +385,12 @@ class MaceEngine::Impl { ...@@ -189,11 +385,12 @@ class MaceEngine::Impl {
MACE_DISABLE_COPY_AND_ASSIGN(Impl); MACE_DISABLE_COPY_AND_ASSIGN(Impl);
}; };
MaceEngine::Impl::Impl(DeviceType device_type) MaceEngine::Impl::Impl(const MaceEngineConfig &config)
: model_data_(nullptr), : model_data_(nullptr),
model_data_size_(0), model_data_size_(0),
op_registry_(new OperatorRegistry()), op_registry_(new OperatorRegistry()),
device_type_(device_type), device_type_(config.device_type()),
device_(nullptr),
ws_(new Workspace()), ws_(new Workspace()),
net_(nullptr) net_(nullptr)
#ifdef MACE_ENABLE_HEXAGON #ifdef MACE_ENABLE_HEXAGON
...@@ -201,6 +398,19 @@ MaceEngine::Impl::Impl(DeviceType device_type) ...@@ -201,6 +398,19 @@ MaceEngine::Impl::Impl(DeviceType device_type)
#endif #endif
{ {
LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion(); LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion();
if (device_type_ == DeviceType::CPU || device_type_ == DeviceType::HEXAGON) {
device_.reset(new CPUDevice(config.num_threads()));
}
#ifdef MACE_ENABLE_OPENCL
if (device_type_ == DeviceType::GPU) {
device_.reset(new GPUDevice(config.gpu_context()->opencl_tuner(),
config.gpu_context()->opencl_cache_storage(),
config.gpu_priority_hint(),
config.gpu_perf_hint(),
config.gpu_context()->opencl_binary_storage(),
config.num_threads()));
}
#endif
} }
MaceStatus MaceEngine::Impl::Init( MaceStatus MaceEngine::Impl::Init(
...@@ -212,7 +422,7 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -212,7 +422,7 @@ MaceStatus MaceEngine::Impl::Init(
// Check avalibility // Check avalibility
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
if (device_type_ == DeviceType::GPU) { if (device_type_ == DeviceType::GPU) {
MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def)); MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def, device_.get()));
} }
#endif #endif
// Get input and output information. // Get input and output information.
...@@ -230,7 +440,7 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -230,7 +440,7 @@ MaceStatus MaceEngine::Impl::Init(
<< MakeString(MapKeys(input_info_map_)); << MakeString(MapKeys(input_info_map_));
} }
ws_->CreateTensor(MakeString("mace_input_node_", input_name), ws_->CreateTensor(MakeString("mace_input_node_", input_name),
GetDeviceAllocator(device_type_), DT_FLOAT); device_->allocator(), DT_FLOAT);
} }
for (auto output_name : output_nodes) { for (auto output_name : output_nodes) {
if (output_info_map_.find(output_name) == output_info_map_.end()) { if (output_info_map_.find(output_name) == output_info_map_.end()) {
...@@ -239,7 +449,7 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -239,7 +449,7 @@ MaceStatus MaceEngine::Impl::Init(
<< MakeString(MapKeys(output_info_map_)); << MakeString(MapKeys(output_info_map_));
} }
ws_->CreateTensor(MakeString("mace_output_node_", output_name), ws_->CreateTensor(MakeString("mace_output_node_", output_name),
GetDeviceAllocator(device_type_), DT_FLOAT); device_->allocator(), DT_FLOAT);
} }
#ifdef MACE_ENABLE_HEXAGON #ifdef MACE_ENABLE_HEXAGON
if (device_type_ == HEXAGON) { if (device_type_ == HEXAGON) {
...@@ -255,19 +465,20 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -255,19 +465,20 @@ MaceStatus MaceEngine::Impl::Init(
} }
} else { } else {
#endif #endif
MACE_RETURN_IF_ERROR(ws_->LoadModelTensor( MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def,
*net_def, device_type_, model_data)); device_.get(),
model_data));
// Init model // Init model
auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type_, auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_.get(),
NetMode::INIT); NetMode::INIT);
MACE_RETURN_IF_ERROR(net->Run()); MACE_RETURN_IF_ERROR(net->Run());
net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type_); net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_.get());
#ifdef MACE_ENABLE_HEXAGON #ifdef MACE_ENABLE_HEXAGON
} }
#endif #endif
if (device_type_ == DeviceType::GPU) { if (device_type_ == DeviceType::GPU) {
ws_->RemoveAndReloadBuffer(*net_def, model_data); ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
} }
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
...@@ -360,7 +571,7 @@ MaceStatus MaceEngine::Impl::Run( ...@@ -360,7 +571,7 @@ MaceStatus MaceEngine::Impl::Run(
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
if (device_type_ == GPU) { if (device_type_ == GPU) {
OpenCLRuntime::Global()->SaveBuiltCLProgram(); device_->opencl_runtime()->SaveBuiltCLProgram();
} }
#endif #endif
for (auto &output : *outputs) { for (auto &output : *outputs) {
...@@ -385,8 +596,8 @@ MaceStatus MaceEngine::Impl::Run( ...@@ -385,8 +596,8 @@ MaceStatus MaceEngine::Impl::Run(
return MACE_SUCCESS; return MACE_SUCCESS;
} }
MaceEngine::MaceEngine(DeviceType device_type): MaceEngine::MaceEngine(const MaceEngineConfig &config):
impl_(new MaceEngine::Impl(device_type)) {} impl_(new MaceEngine::Impl(config)) {}
MaceEngine::~MaceEngine() = default; MaceEngine::~MaceEngine() = default;
...@@ -421,7 +632,7 @@ MaceStatus CreateMaceEngineFromProto( ...@@ -421,7 +632,7 @@ MaceStatus CreateMaceEngineFromProto(
const std::string &model_data_file, const std::string &model_data_file,
const std::vector<std::string> &input_nodes, const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes, const std::vector<std::string> &output_nodes,
const DeviceType device_type, const MaceEngineConfig &config,
std::shared_ptr<MaceEngine> *engine) { std::shared_ptr<MaceEngine> *engine) {
LOG(INFO) << "Create MaceEngine from model pb"; LOG(INFO) << "Create MaceEngine from model pb";
// load model // load model
...@@ -432,7 +643,7 @@ MaceStatus CreateMaceEngineFromProto( ...@@ -432,7 +643,7 @@ MaceStatus CreateMaceEngineFromProto(
std::shared_ptr<NetDef> net_def(new NetDef()); std::shared_ptr<NetDef> net_def(new NetDef());
net_def->ParseFromArray(&model_pb[0], model_pb.size()); net_def->ParseFromArray(&model_pb[0], model_pb.size());
engine->reset(new mace::MaceEngine(device_type)); engine->reset(new mace::MaceEngine(config));
MaceStatus status = (*engine)->Init( MaceStatus status = (*engine)->Init(
net_def.get(), input_nodes, output_nodes, model_data_file); net_def.get(), input_nodes, output_nodes, model_data_file);
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/macros.h"
#include "mace/core/file_storage.h"
#include "mace/core/runtime/cpu/cpu_runtime.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/logging.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_runtime.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
class FileStorageFactory::Impl {
public:
explicit Impl(const std::string &path);
std::unique_ptr<KVStorage> CreateStorage(const std::string &name);
private:
std::string path_;
};
FileStorageFactory::Impl::Impl(const std::string &path): path_(path) {}
std::unique_ptr<KVStorage> FileStorageFactory::Impl::CreateStorage(
const std::string &name) {
return std::move(std::unique_ptr<KVStorage>(
new FileStorage(path_ + "/" + name)));
}
FileStorageFactory::FileStorageFactory(const std::string &path):
impl_(new FileStorageFactory::Impl(path)) {}
FileStorageFactory::~FileStorageFactory() = default;
std::unique_ptr<KVStorage> FileStorageFactory::CreateStorage(
const std::string &name) {
return impl_->CreateStorage(name);
}
extern std::shared_ptr<KVStorageFactory> kStorageFactory;
void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory) {
VLOG(1) << "Set internal KV Storage Engine";
kStorageFactory = storage_factory;
}
// Set OpenCL Compiled Binary paths, just call once. (Not thread-safe)
void SetOpenCLBinaryPaths(const std::vector<std::string> &paths) {
#ifdef MACE_ENABLE_OPENCL
OpenCLRuntime::ConfigureOpenCLBinaryPath(paths);
#else
MACE_UNUSED(paths);
#endif // MACE_ENABLE_OPENCL
}
extern std::string kOpenCLParameterPath;
void SetOpenCLParameterPath(const std::string &path) {
#ifdef MACE_ENABLE_OPENCL
kOpenCLParameterPath = path;
#else
MACE_UNUSED(path);
#endif // MACE_ENABLE_OPENCL
}
void SetGPUHints(GPUPerfHint gpu_perf_hint, GPUPriorityHint gpu_priority_hint) {
#ifdef MACE_ENABLE_OPENCL
VLOG(1) << "Set GPU configurations, gpu_perf_hint: " << gpu_perf_hint
<< ", gpu_priority_hint: " << gpu_priority_hint;
OpenCLRuntime::Configure(gpu_perf_hint, gpu_priority_hint);
#else
MACE_UNUSED(gpu_perf_hint);
MACE_UNUSED(gpu_priority_hint);
#endif // MACE_ENABLE_OPENCL
}
MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
CPUAffinityPolicy policy,
bool use_gemmlowp) {
VLOG(1) << "Set OpenMP threads number hint: " << num_threads_hint
<< ", affinity policy: " << policy;
return SetOpenMPThreadsAndAffinityPolicy(num_threads_hint,
policy,
use_gemmlowp);
}
MaceStatus SetOpenMPThreadAffinity(int num_threads,
const std::vector<int> &cpu_ids) {
return SetOpenMPThreadsAndAffinityCPUs(num_threads, cpu_ids);
}
MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
std::vector<int> *little_core_ids) {
return GetCPUBigLittleCoreIDs(big_core_ids, little_core_ids);
}
}; // namespace mace
mace { mace {
global: global:
*GPUContextBuilder*;
*MaceEngineConfig*;
*MaceTensor*; *MaceTensor*;
*MaceEngine*; *MaceEngine*;
*CreateMaceEngineFromProto*; *CreateMaceEngineFromProto*;
*FileStorageFactory*;
*SetKVStorageFactory*;
*SetOpenCLBinaryPaths*;
*SetOpenCLParameterPath*;
*SetGPUHints*;
*SetOpenMPThreadPolicy*;
*SetOpenMPThreadAffinity*;
*GetBigLittleCoreIDs*; *GetBigLittleCoreIDs*;
*MaceVersion*; *MaceVersion*;
......
...@@ -23,8 +23,25 @@ cc_library( ...@@ -23,8 +23,25 @@ cc_library(
hdrs = [ hdrs = [
"ops_test_util.h", "ops_test_util.h",
], ],
srcs = [
"ops_test_util.cc",
],
copts = [
"-Werror",
"-Wextra",
] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
deps = [ deps = [
"//mace/core", "//mace/ops",
"@gtest", "@gtest",
], ],
) )
...@@ -36,6 +53,7 @@ cc_library( ...@@ -36,6 +53,7 @@ cc_library(
exclude = [ exclude = [
"*_test.cc", "*_test.cc",
"*_benchmark.cc", "*_benchmark.cc",
"ops_test_util.cc",
"buffer_to_image.cc", "buffer_to_image.cc",
"image_to_buffer.cc", "image_to_buffer.cc",
"lstmcell.cc", "lstmcell.cc",
......
...@@ -26,9 +26,10 @@ namespace ops { ...@@ -26,9 +26,10 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class ActivationOp : public Operator<D, T> { class ActivationOp : public Operator<D, T> {
public: public:
ActivationOp(const OperatorDef &operator_def, Workspace *ws) ActivationOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(kernels::StringToActivationType( functor_(context,
kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation", OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")), "NOOP")),
static_cast<T>( static_cast<T>(
......
...@@ -58,7 +58,7 @@ void TestSimpleRelu() { ...@@ -58,7 +58,7 @@ void TestSimpleRelu() {
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); {2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -106,7 +106,7 @@ void TestUnalignedSimpleRelu() { ...@@ -106,7 +106,7 @@ void TestUnalignedSimpleRelu() {
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5}); auto expected = net.CreateTensor<float>({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -159,7 +159,7 @@ void TestSimpleRelux() { ...@@ -159,7 +159,7 @@ void TestSimpleRelux() {
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -209,7 +209,7 @@ void TestSimpleReluRelux() { ...@@ -209,7 +209,7 @@ void TestSimpleReluRelux() {
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -267,7 +267,7 @@ void TestSimplePrelu() { ...@@ -267,7 +267,7 @@ void TestSimplePrelu() {
} }
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {2, 2, 2, 2},
{-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0}); {-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -318,7 +318,7 @@ void TestSimpleTanh() { ...@@ -318,7 +318,7 @@ void TestSimpleTanh() {
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {2, 2, 2, 2},
{-0.99999834, 0.99999834, -0.99998771, 0.99998771, -0.9999092, 0.9999092, {-0.99999834, 0.99999834, -0.99998771, 0.99998771, -0.9999092, 0.9999092,
-0.9993293, 0.9993293, -0.99505475, 0.99505475, -0.96402758, 0.96402758, -0.9993293, 0.9993293, -0.99505475, 0.99505475, -0.96402758, 0.96402758,
...@@ -371,7 +371,7 @@ void TestSimpleSigmoid() { ...@@ -371,7 +371,7 @@ void TestSimpleSigmoid() {
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {2, 2, 2, 2},
{9.11051194e-04, 9.99088949e-01, 2.47262316e-03, 9.97527377e-01, {9.11051194e-04, 9.99088949e-01, 2.47262316e-03, 9.97527377e-01,
6.69285092e-03, 9.93307149e-01, 1.79862100e-02, 9.82013790e-01, 6.69285092e-03, 9.93307149e-01, 1.79862100e-02, 9.82013790e-01,
......
...@@ -26,8 +26,8 @@ namespace ops { ...@@ -26,8 +26,8 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class AddNOp : public Operator<D, T> { class AddNOp : public Operator<D, T> {
public: public:
AddNOp(const OperatorDef &operator_def, Workspace *ws) AddNOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws) {} : Operator<D, T>(operator_def, context), functor_(context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
Tensor *output_tensor = this->Output(0); Tensor *output_tensor = this->Output(0);
......
...@@ -39,7 +39,7 @@ void SimpleAdd2() { ...@@ -39,7 +39,7 @@ void SimpleAdd2() {
// Run // Run
net.RunOp(D); net.RunOp(D);
auto expected = CreateTensor<float>({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12}); auto expected = net.CreateTensor<float>({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -98,7 +98,7 @@ void SimpleAdd3() { ...@@ -98,7 +98,7 @@ void SimpleAdd3() {
} }
auto expected = auto expected =
CreateTensor<float>({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24}); net.CreateTensor<float>({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4, 1e-3); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4, 1e-3);
} }
...@@ -136,8 +136,8 @@ void RandomTest() { ...@@ -136,8 +136,8 @@ void RandomTest() {
// run on cpu // run on cpu
net.RunOp(); net.RunOp();
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
for (int i = 0; i < input_num; ++i) { for (int i = 0; i < input_num; ++i) {
...@@ -160,7 +160,7 @@ void RandomTest() { ...@@ -160,7 +160,7 @@ void RandomTest() {
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-2); 1e-2);
} }
} }
......
...@@ -26,8 +26,8 @@ namespace ops { ...@@ -26,8 +26,8 @@ namespace ops {
template<DeviceType D, class T> template<DeviceType D, class T>
class ArgMaxOp : public Operator<D, T> { class ArgMaxOp : public Operator<D, T> {
public: public:
ArgMaxOp(const OperatorDef &operator_def, Workspace *ws) ArgMaxOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws) {} : Operator<D, T>(operator_def, context), functor_(context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(0); const Tensor *input = this->Input(0);
......
...@@ -47,7 +47,7 @@ void ArgMaxTest(const std::vector<index_t> &input_shape, ...@@ -47,7 +47,7 @@ void ArgMaxTest(const std::vector<index_t> &input_shape,
} }
// Check // Check
auto expected = CreateTensor<int32_t>(output_shape, output); auto expected = net.CreateTensor<int32_t>(output_shape, output);
ExpectTensorNear<int32_t>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<int32_t>(*expected, *net.GetOutput("Output"), 1e-5);
} }
} // namespace } // namespace
......
...@@ -25,9 +25,9 @@ namespace ops { ...@@ -25,9 +25,9 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class BatchNormOp : public Operator<D, T> { class BatchNormOp : public Operator<D, T> {
public: public:
BatchNormOp(const OperatorDef &operator_def, Workspace *ws) BatchNormOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(false, kernels::ActivationType::NOOP, 0.0f) { functor_(context, false, kernels::ActivationType::NOOP, 0.0f) {
epsilon_ = OperatorBase::GetOptionalArg<float>("epsilon", epsilon_ = OperatorBase::GetOptionalArg<float>("epsilon",
static_cast<float>(1e-4)); static_cast<float>(1e-4));
} }
...@@ -52,7 +52,8 @@ class BatchNormOp : public Operator<D, T> { ...@@ -52,7 +52,8 @@ class BatchNormOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
MACE_RETURN_IF_ERROR(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, scale, offset, mean, var, epsilon_, output, future); return functor_(input, scale, offset,
mean, var, epsilon_, output, future);
} }
private: private:
......
...@@ -79,7 +79,7 @@ void Simple() { ...@@ -79,7 +79,7 @@ void Simple() {
} }
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708, {1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708,
3.1708, 5.5125, 5.5125, 7.8543, 7.8543}); 3.1708, 5.5125, 5.5125, 7.8543, 7.8543});
...@@ -130,8 +130,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -130,8 +130,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
...@@ -166,7 +166,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -166,7 +166,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4);
} }
TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
...@@ -208,8 +209,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -208,8 +209,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
...@@ -245,7 +246,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -245,7 +246,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-1, 1e-2);
} }
TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
...@@ -287,8 +289,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -287,8 +289,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
...@@ -323,7 +325,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -323,7 +325,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4);
} }
TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
...@@ -365,8 +368,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -365,8 +368,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
...@@ -402,7 +405,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -402,7 +405,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-1, 1e-2);
} }
} // namespace test } // namespace test
......
...@@ -27,9 +27,10 @@ namespace ops { ...@@ -27,9 +27,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class BatchToSpaceNDOp : public Operator<D, T> { class BatchToSpaceNDOp : public Operator<D, T> {
public: public:
BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws) BatchToSpaceNDOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetRepeatedArgs<int>("crops", {0, 0, 0, 0}), functor_(context,
OperatorBase::GetRepeatedArgs<int>("crops", {0, 0, 0, 0}),
OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1}), OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1}),
true) {} true) {}
......
...@@ -24,10 +24,11 @@ namespace ops { ...@@ -24,10 +24,11 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class BiasAddOp : public Operator<D, T> { class BiasAddOp : public Operator<D, T> {
public: public:
BiasAddOp(const OperatorDef &operator_def, Workspace *ws) BiasAddOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(static_cast<DataFormat>(OperatorBase::GetOptionalArg<int>( functor_(context,
"data_format", NHWC))) {} static_cast<DataFormat>(OperatorBase::GetOptionalArg<int>(
"data_format", NHWC))) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -66,7 +66,7 @@ void BiasAddSimple() { ...@@ -66,7 +66,7 @@ void BiasAddSimple() {
} }
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 6, 2, 1}, {1, 6, 2, 1},
{5.5, 5.5, 7.5, 7.5, 9.5, 9.5, 11.5, 11.5, 13.5, 13.5, 15.5, 15.5}); {5.5, 5.5, 7.5, 7.5, 9.5, 9.5, 11.5, 11.5, 13.5, 13.5, 15.5, 15.5});
...@@ -111,8 +111,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -111,8 +111,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
...@@ -132,7 +132,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -132,7 +132,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
...@@ -167,8 +167,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -167,8 +167,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
...@@ -188,7 +188,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -188,7 +188,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
} // namespace test } // namespace test
......
...@@ -24,9 +24,10 @@ namespace ops { ...@@ -24,9 +24,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class BufferToImageOp : public Operator<D, T> { class BufferToImageOp : public Operator<D, T> {
public: public:
BufferToImageOp(const OperatorDef &op_def, Workspace *ws) BufferToImageOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {} functor_(context,
OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT); const Tensor *input_tensor = this->Input(INPUT);
......
...@@ -25,8 +25,8 @@ namespace ops { ...@@ -25,8 +25,8 @@ namespace ops {
template <DeviceType D, typename SrcType> template <DeviceType D, typename SrcType>
class CastOp : public Operator<D, SrcType> { class CastOp : public Operator<D, SrcType> {
public: public:
CastOp(const OperatorDef &op_def, Workspace *ws) CastOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, SrcType>(op_def, ws) {} : Operator<D, SrcType>(op_def, context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
MACE_UNUSED(future); MACE_UNUSED(future);
......
...@@ -26,10 +26,10 @@ namespace ops { ...@@ -26,10 +26,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ChannelShuffleOp : public Operator<D, T> { class ChannelShuffleOp : public Operator<D, T> {
public: public:
ChannelShuffleOp(const OperatorDef &operator_def, Workspace *ws) ChannelShuffleOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
group_(OperatorBase::GetOptionalArg<int>("group", 1)), group_(OperatorBase::GetOptionalArg<int>("group", 1)),
functor_(this->group_) {} functor_(context, this->group_) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -45,7 +45,7 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) { ...@@ -45,7 +45,7 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 1, 2, 8}, {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}); {1, 1, 2, 8}, {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -77,7 +77,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { ...@@ -77,7 +77,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 1, 2, 16}, {1, 1, 2, 16},
{0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31}); 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31});
......
...@@ -26,9 +26,9 @@ namespace ops { ...@@ -26,9 +26,9 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ConcatOp : public Operator<D, T> { class ConcatOp : public Operator<D, T> {
public: public:
ConcatOp(const OperatorDef &op_def, Workspace *ws) ConcatOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetOptionalArg<int>("axis", 3)) {} functor_(context, OperatorBase::GetOptionalArg<int>("axis", 3)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
MACE_CHECK(this->InputSize() >= 2) MACE_CHECK(this->InputSize() >= 2)
......
...@@ -28,9 +28,10 @@ namespace ops { ...@@ -28,9 +28,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class Conv2dOp : public ConvPool2dOpBase<D, T> { class Conv2dOp : public ConvPool2dOpBase<D, T> {
public: public:
Conv2dOp(const OperatorDef &op_def, Workspace *ws) Conv2dOp(const OperatorDef &op_def, OpKernelContext *context)
: ConvPool2dOpBase<D, T>(op_def, ws), : ConvPool2dOpBase<D, T>(op_def, context),
functor_(this->strides_.data(), functor_(context,
this->strides_.data(),
this->padding_type_, this->padding_type_,
this->paddings_, this->paddings_,
this->dilations_.data(), this->dilations_.data(),
...@@ -40,7 +41,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> { ...@@ -40,7 +41,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f), OperatorBase::GetOptionalArg<float>("max_limit", 0.0f),
static_cast<bool>(OperatorBase::GetOptionalArg<int>( static_cast<bool>(OperatorBase::GetOptionalArg<int>(
"is_filter_transformed", false)), "is_filter_transformed", false)),
ws->GetScratchBuffer(D)) {} context->workspace()->GetScratchBuffer(D)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -84,7 +84,7 @@ void TestNHWCSimple3x3VALID() { ...@@ -84,7 +84,7 @@ void TestNHWCSimple3x3VALID() {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f}); auto expected = net.CreateTensor<float>({1, 1, 1, 1}, {18.1f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -147,7 +147,7 @@ void TestNHWCSimple3x3SAME() { ...@@ -147,7 +147,7 @@ void TestNHWCSimple3x3SAME() {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 3, 3, 1}, {1, 3, 3, 1},
{8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
...@@ -221,7 +221,7 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -221,7 +221,7 @@ void TestNHWCSimple3x3WithoutBias() {
} }
// Check // Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.0f}); auto expected = net.CreateTensor<float>({1, 1, 1, 1}, {18.0f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -298,7 +298,7 @@ void TestNHWCCombined3x3() { ...@@ -298,7 +298,7 @@ void TestNHWCCombined3x3() {
} }
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f, {1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f,
9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f}); 9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -374,7 +374,7 @@ void TestFusedNHWCSimple3x3VALID() { ...@@ -374,7 +374,7 @@ void TestFusedNHWCSimple3x3VALID() {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f}); auto expected = net.CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output")); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
} }
template <DeviceType D, typename T> template <DeviceType D, typename T>
...@@ -434,7 +434,7 @@ void TestFusedNHWCSimple3x3WithoutBias() { ...@@ -434,7 +434,7 @@ void TestFusedNHWCSimple3x3WithoutBias() {
} }
// Check // Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f}); auto expected = net.CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output")); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
} }
...@@ -515,7 +515,7 @@ void TestConv1x1() { ...@@ -515,7 +515,7 @@ void TestConv1x1() {
} }
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 3, 10, 2}, {1, 3, 10, 2},
{5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
...@@ -576,8 +576,8 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape, ...@@ -576,8 +576,8 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
"Output", NHWC); "Output", NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
...@@ -602,7 +602,7 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape, ...@@ -602,7 +602,7 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4); 1e-4);
}; };
...@@ -685,8 +685,8 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -685,8 +685,8 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
"Output", NHWC); "Output", NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, half>(&net, "Input", "InputImage", BufferToImage<D, half>(&net, "Input", "InputImage",
...@@ -712,7 +712,7 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -712,7 +712,7 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-1); 1e-1);
}; };
...@@ -837,8 +837,8 @@ void TestDilationConvNxN(const std::vector<index_t> &shape, ...@@ -837,8 +837,8 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
"Output", NHWC); "Output", NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
...@@ -863,7 +863,7 @@ void TestDilationConvNxN(const std::vector<index_t> &shape, ...@@ -863,7 +863,7 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4); 1e-4);
}; };
...@@ -934,8 +934,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, ...@@ -934,8 +934,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC); "Output", NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, half>(&net, "Input", "InputImage", BufferToImage<D, half>(&net, "Input", "InputImage",
...@@ -960,7 +960,7 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, ...@@ -960,7 +960,7 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-1); 1e-1);
}; };
...@@ -1021,8 +1021,8 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, ...@@ -1021,8 +1021,8 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
"Output", NHWC); "Output", NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
...@@ -1046,7 +1046,7 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, ...@@ -1046,7 +1046,7 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4); 1e-4);
}; };
...@@ -1104,7 +1104,7 @@ void TestQuantSimple3x3() { ...@@ -1104,7 +1104,7 @@ void TestQuantSimple3x3() {
// Run // Run
net.Run(); net.Run();
// Check // Check
auto expected = CreateTensor<uint8_t>({1, 1, 1, 1}, {230}); auto expected = net.CreateTensor<uint8_t>({1, 1, 1, 1}, {230});
ExpectTensorNear<uint8_t>(*expected, *output); ExpectTensorNear<uint8_t>(*expected, *output);
} }
......
...@@ -26,8 +26,8 @@ namespace ops { ...@@ -26,8 +26,8 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class ConvPool2dOpBase : public Operator<D, T> { class ConvPool2dOpBase : public Operator<D, T> {
public: public:
ConvPool2dOpBase(const OperatorDef &op_def, Workspace *ws) ConvPool2dOpBase(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
strides_(OperatorBase::GetRepeatedArgs<int>("strides")), strides_(OperatorBase::GetRepeatedArgs<int>("strides")),
padding_type_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>( padding_type_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
"padding", static_cast<int>(SAME)))), "padding", static_cast<int>(SAME)))),
......
...@@ -21,6 +21,8 @@ namespace test { ...@@ -21,6 +21,8 @@ namespace test {
TEST(CoreTest, INIT_MODE) { TEST(CoreTest, INIT_MODE) {
std::vector<OperatorDef> op_defs; std::vector<OperatorDef> op_defs;
Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU);
std::unique_ptr<Tuner<uint32_t>> tuner;
Workspace ws; Workspace ws;
op_defs.emplace_back(OperatorDef()); op_defs.emplace_back(OperatorDef());
...@@ -31,7 +33,7 @@ TEST(CoreTest, INIT_MODE) { ...@@ -31,7 +33,7 @@ TEST(CoreTest, INIT_MODE) {
.AddIntArg("mode", static_cast<int>(NetMode::INIT)) .AddIntArg("mode", static_cast<int>(NetMode::INIT))
.Finalize(&op_defs[op_defs.size() - 1]); .Finalize(&op_defs[op_defs.size() - 1]);
Tensor *input = ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::GPU), Tensor *input = ws.CreateTensor("Input", device->allocator(),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
input->Resize({1, 3, 3, 3}); input->Resize({1, 3, 3, 3});
{ {
...@@ -53,13 +55,13 @@ TEST(CoreTest, INIT_MODE) { ...@@ -53,13 +55,13 @@ TEST(CoreTest, INIT_MODE) {
} }
std::shared_ptr<OperatorRegistryBase> op_registry(new OperatorRegistry()); std::shared_ptr<OperatorRegistryBase> op_registry(new OperatorRegistry());
auto net = auto net =
CreateNet(op_registry, net_def, &ws, DeviceType::GPU, NetMode::INIT); CreateNet(op_registry, net_def, &ws, device, NetMode::INIT);
net->Run(); net->Run();
EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr); EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr);
EXPECT_TRUE(ws.GetTensor("Output") == nullptr); EXPECT_TRUE(ws.GetTensor("Output") == nullptr);
net = CreateNet(op_registry, net_def, &ws, DeviceType::GPU); net = CreateNet(op_registry, net_def, &ws, device);
net->Run(); net->Run();
EXPECT_TRUE(ws.GetTensor("Output") != nullptr); EXPECT_TRUE(ws.GetTensor("Output") != nullptr);
......
...@@ -26,9 +26,10 @@ namespace ops { ...@@ -26,9 +26,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class CropOp : public Operator<D, T> { class CropOp : public Operator<D, T> {
public: public:
CropOp(const OperatorDef &op_def, Workspace *ws) CropOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetOptionalArg<int>("axis", 2), functor_(context,
OperatorBase::GetOptionalArg<int>("axis", 2),
OperatorBase::GetRepeatedArgs<int>("offset")) {} OperatorBase::GetRepeatedArgs<int>("offset")) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
......
...@@ -75,7 +75,7 @@ void RunCrop(const std::vector<index_t> &input_shape, ...@@ -75,7 +75,7 @@ void RunCrop(const std::vector<index_t> &input_shape,
"Output", NHWC); "Output", NHWC);
} }
// Check // Check
auto expected = CreateTensor<float>(expected_shape, expected_data); auto expected = net.CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output")); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
} }
} // namespace } // namespace
......
...@@ -26,9 +26,10 @@ namespace ops { ...@@ -26,9 +26,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class Deconv2dOp : public Operator<D, T> { class Deconv2dOp : public Operator<D, T> {
public: public:
Deconv2dOp(const OperatorDef &op_def, Workspace *ws) Deconv2dOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetRepeatedArgs<int>("strides"), functor_(context,
OperatorBase::GetRepeatedArgs<int>("strides"),
static_cast<Padding>(OperatorBase::GetOptionalArg<int>( static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
"padding", static_cast<int>(SAME))), "padding", static_cast<int>(SAME))),
OperatorBase::GetRepeatedArgs<int>("padding_values"), OperatorBase::GetRepeatedArgs<int>("padding_values"),
......
...@@ -79,7 +79,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape, ...@@ -79,7 +79,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
"Output", NHWC); "Output", NHWC);
} }
auto expected = CreateTensor<float>(expected_shape, expected_data); auto expected = net.CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.0001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.0001);
} }
...@@ -350,8 +350,8 @@ void TestComplexDeconvNxNS12(const int batch, ...@@ -350,8 +350,8 @@ void TestComplexDeconvNxNS12(const int batch,
"Output", NHWC); "Output", NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
...@@ -377,7 +377,7 @@ void TestComplexDeconvNxNS12(const int batch, ...@@ -377,7 +377,7 @@ void TestComplexDeconvNxNS12(const int batch,
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4); 1e-4);
}; };
......
...@@ -27,10 +27,10 @@ namespace ops { ...@@ -27,10 +27,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class DepthToSpaceOp : public Operator<D, T> { class DepthToSpaceOp : public Operator<D, T> {
public: public:
DepthToSpaceOp(const OperatorDef &op_def, Workspace *ws) DepthToSpaceOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
block_size_(OperatorBase::GetOptionalArg<int>("block_size", 1)), block_size_(OperatorBase::GetOptionalArg<int>("block_size", 1)),
functor_(this->block_size_, true) {} functor_(context, this->block_size_, true) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -64,7 +64,7 @@ void RunDepthToSpace(const bool d2s, ...@@ -64,7 +64,7 @@ void RunDepthToSpace(const bool d2s,
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
auto expected = CreateTensor<float>(expected_shape, expected_data); auto expected = net.CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
} // namespace } // namespace
......
...@@ -29,9 +29,10 @@ namespace ops { ...@@ -29,9 +29,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> { class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
public: public:
DepthwiseConv2dOp(const OperatorDef &op_def, Workspace *ws) DepthwiseConv2dOp(const OperatorDef &op_def, OpKernelContext *context)
: ConvPool2dOpBase<D, T>(op_def, ws), : ConvPool2dOpBase<D, T>(op_def, context),
functor_(this->strides_.data(), functor_(context,
this->strides_.data(),
this->padding_type_, this->padding_type_,
this->paddings_, this->paddings_,
this->dilations_.data(), this->dilations_.data(),
......
...@@ -80,7 +80,7 @@ void SimpleValidTest() { ...@@ -80,7 +80,7 @@ void SimpleValidTest() {
} }
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 2, 2, 2}, {1, 2, 2, 2},
{37.1f, 148.2f, 47.1f, 188.2f, 67.1f, 268.2f, 77.1f, 308.2f}); {37.1f, 148.2f, 47.1f, 188.2f, 67.1f, 268.2f, 77.1f, 308.2f});
...@@ -212,7 +212,7 @@ void ComplexValidTest(index_t batch, ...@@ -212,7 +212,7 @@ void ComplexValidTest(index_t batch,
} }
auto expected = auto expected =
CreateTensor<T>({1, out_height, out_width, out_channels}, expect); net.CreateTensor<T>({1, out_height, out_width, out_channels}, expect);
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -284,8 +284,8 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -284,8 +284,8 @@ void TestNxNS12(const index_t height, const index_t width) {
"Output", NHWC); "Output", NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -312,10 +312,10 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -312,10 +312,10 @@ void TestNxNS12(const index_t height, const index_t width) {
// Check // Check
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 1e-5, ExpectTensorNear<float>(*expected, *net.GetOutput("DeviceOutput"), 1e-5,
1e-4); 1e-4);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 1e-2, ExpectTensorNear<float>(*expected, *net.GetOutput("DeviceOutput"), 1e-2,
1e-2); 1e-2);
} }
}; };
...@@ -387,7 +387,7 @@ void QuantSimpleValidTest() { ...@@ -387,7 +387,7 @@ void QuantSimpleValidTest() {
net.Run(); net.Run();
// Check // Check
auto expected = CreateTensor<uint8_t>({1, 1, 1, 2}, {255, 21}); auto expected = net.CreateTensor<uint8_t>({1, 1, 1, 2}, {255, 21});
ExpectTensorNear<uint8_t>(*expected, *net.GetOutput("Output")); ExpectTensorNear<uint8_t>(*expected, *net.GetOutput("Output"));
} }
......
...@@ -24,9 +24,10 @@ namespace ops { ...@@ -24,9 +24,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class EltwiseOp : public Operator<D, T> { class EltwiseOp : public Operator<D, T> {
public: public:
EltwiseOp(const OperatorDef &op_def, Workspace *ws) EltwiseOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_( functor_(
context,
static_cast<kernels::EltwiseType>(OperatorBase::GetOptionalArg<int>( static_cast<kernels::EltwiseType>(OperatorBase::GetOptionalArg<int>(
"type", static_cast<int>(kernels::EltwiseType::NONE))), "type", static_cast<int>(kernels::EltwiseType::NONE))),
OperatorBase::GetRepeatedArgs<float>("coeff"), OperatorBase::GetRepeatedArgs<float>("coeff"),
......
...@@ -49,7 +49,7 @@ void SimpleScalarScalar(const kernels::EltwiseType type, ...@@ -49,7 +49,7 @@ void SimpleScalarScalar(const kernels::EltwiseType type,
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
auto expected = CreateTensor<DstType>({}, {output}); auto expected = net.CreateTensor<DstType>({}, {output});
ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -97,7 +97,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type, ...@@ -97,7 +97,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
auto expected = CreateTensor<DstType>(shape, output); auto expected = net.CreateTensor<DstType>(shape, output);
ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -167,7 +167,7 @@ void SimpleTensorEltwise(const kernels::EltwiseType type, ...@@ -167,7 +167,7 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
if (input0.size() < input1.size()) { if (input0.size() < input1.size()) {
output_shape = shape1; output_shape = shape1;
} }
auto expected = CreateTensor<DstType>(output_shape, output); auto expected = net.CreateTensor<DstType>(output_shape, output);
ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -206,7 +206,7 @@ void TensorGeneralBroadcastEltwise(const kernels::EltwiseType type, ...@@ -206,7 +206,7 @@ void TensorGeneralBroadcastEltwise(const kernels::EltwiseType type,
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
auto expected = CreateTensor<DstType>(output_shape, output); auto expected = net.CreateTensor<DstType>(output_shape, output);
ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5);
} }
} // namespace } // namespace
...@@ -476,8 +476,8 @@ void RandomTensorScalar(const kernels::EltwiseType type, ...@@ -476,8 +476,8 @@ void RandomTensorScalar(const kernels::EltwiseType type,
net.RunOp(DeviceType::CPU); net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output", net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC); NHWC);
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImg", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -496,9 +496,9 @@ void RandomTensorScalar(const kernels::EltwiseType type, ...@@ -496,9 +496,9 @@ void RandomTensorScalar(const kernels::EltwiseType type,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("GPUOutput"), 1e-5);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2);
} }
} }
...@@ -531,8 +531,8 @@ void RandomTensorEltwise(const kernels::EltwiseType type, ...@@ -531,8 +531,8 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
net.RunOp(DeviceType::CPU); net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output", net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC); NHWC);
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImg0", BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImg0",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -554,9 +554,9 @@ void RandomTensorEltwise(const kernels::EltwiseType type, ...@@ -554,9 +554,9 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("GPUOutput"), 1e-5);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2);
} }
} }
} // namespace } // namespace
......
...@@ -26,9 +26,9 @@ namespace ops { ...@@ -26,9 +26,9 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class FillOp : public Operator<D, T> { class FillOp : public Operator<D, T> {
public: public:
FillOp(const OperatorDef &operator_def, Workspace *ws) FillOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_() {} functor_(context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *shape = this->Input(SHAPE); const Tensor *shape = this->Input(SHAPE);
......
...@@ -26,9 +26,10 @@ namespace ops { ...@@ -26,9 +26,10 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class FoldedBatchNormOp : public Operator<D, T> { class FoldedBatchNormOp : public Operator<D, T> {
public: public:
FoldedBatchNormOp(const OperatorDef &operator_def, Workspace *ws) FoldedBatchNormOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(true, functor_(context,
true,
kernels::StringToActivationType( kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation", OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")), "NOOP")),
......
...@@ -83,7 +83,7 @@ void Simple() { ...@@ -83,7 +83,7 @@ void Simple() {
} }
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708, {1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708,
3.1708, 5.5125, 5.5125, 7.8543, 7.8543}); 3.1708, 5.5125, 5.5125, 7.8543, 7.8543});
...@@ -129,8 +129,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ...@@ -129,8 +129,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
...@@ -153,7 +153,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ...@@ -153,7 +153,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4);
} }
TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
...@@ -190,8 +191,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -190,8 +191,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
...@@ -215,7 +216,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -215,7 +216,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-2, 1e-2);
} }
TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
...@@ -252,8 +254,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ...@@ -252,8 +254,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
...@@ -275,7 +277,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ...@@ -275,7 +277,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4);
} }
TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
...@@ -312,8 +315,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -312,8 +315,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
...@@ -336,7 +339,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -336,7 +339,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-2, 1e-2);
} }
} // namespace test } // namespace test
......
...@@ -26,9 +26,9 @@ namespace ops { ...@@ -26,9 +26,9 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class FullyConnectedOp : public Operator<D, T> { class FullyConnectedOp : public Operator<D, T> {
public: public:
FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws) FullyConnectedOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(kernels::StringToActivationType( functor_(context, kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation", OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")), "NOOP")),
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f)) {} OperatorBase::GetOptionalArg<float>("max_limit", 0.0f)) {}
...@@ -61,7 +61,8 @@ class FullyConnectedOp : public Operator<D, T> { ...@@ -61,7 +61,8 @@ class FullyConnectedOp : public Operator<D, T> {
" don't match."); " don't match.");
} }
return functor_(input, weight, bias, output, future); return functor_(input, weight,
bias, output, future);
} }
private: private:
......
...@@ -76,7 +76,7 @@ void Simple(const std::vector<index_t> &input_shape, ...@@ -76,7 +76,7 @@ void Simple(const std::vector<index_t> &input_shape,
} }
// Check // Check
auto expected = CreateTensor<float>(output_shape, output_value); auto expected = net.CreateTensor<float>(output_shape, output_value);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -156,8 +156,8 @@ void Random(const index_t batch, ...@@ -156,8 +156,8 @@ void Random(const index_t batch,
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC); net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
...@@ -181,10 +181,10 @@ void Random(const index_t batch, ...@@ -181,10 +181,10 @@ void Random(const index_t batch,
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-1,
1e-1); 1e-1);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-3); 1e-3);
} }
} }
......
...@@ -24,9 +24,10 @@ namespace ops { ...@@ -24,9 +24,10 @@ namespace ops {
template<DeviceType D, class T> template<DeviceType D, class T>
class GatherOp : public Operator<D, T> { class GatherOp : public Operator<D, T> {
public: public:
GatherOp(const OperatorDef &operator_def, Workspace *ws) GatherOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetOptionalArg<int>("axis", 0), functor_(context,
OperatorBase::GetOptionalArg<int>("axis", 0),
OperatorBase::GetOptionalArg<float>("y", 1.0)) {} OperatorBase::GetOptionalArg<float>("y", 1.0)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
......
...@@ -47,7 +47,7 @@ void TestGather(const std::vector<index_t> &weight_shape, ...@@ -47,7 +47,7 @@ void TestGather(const std::vector<index_t> &weight_shape,
// Run // Run
net.RunOp(CPU); net.RunOp(CPU);
auto expected = CreateTensor<float>(output_shape, output); auto expected = net.CreateTensor<float>(output_shape, output);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
......
...@@ -25,8 +25,8 @@ namespace ops { ...@@ -25,8 +25,8 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class IdentityOp : public Operator<D, T> { class IdentityOp : public Operator<D, T> {
public: public:
IdentityOp(const OperatorDef &op_def, Workspace *ws) IdentityOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws) {} : Operator<D, T>(op_def, context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -24,9 +24,10 @@ namespace ops { ...@@ -24,9 +24,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ImageToBufferOp : public Operator<D, T> { class ImageToBufferOp : public Operator<D, T> {
public: public:
ImageToBufferOp(const OperatorDef &op_def, Workspace *ws) ImageToBufferOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {} functor_(context,
OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -26,8 +26,8 @@ namespace ops { ...@@ -26,8 +26,8 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class InferConv2dShapeOp : public Operator<D, T> { class InferConv2dShapeOp : public Operator<D, T> {
public: public:
InferConv2dShapeOp(const OperatorDef &op_def, Workspace *ws) InferConv2dShapeOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws) {} : Operator<D, T>(op_def, context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -24,8 +24,8 @@ namespace ops { ...@@ -24,8 +24,8 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class LocalResponseNormOp : public Operator<D, T> { class LocalResponseNormOp : public Operator<D, T> {
public: public:
LocalResponseNormOp(const OperatorDef &operator_def, Workspace *ws) LocalResponseNormOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), functor_() { : Operator<D, T>(operator_def, context), functor_(context) {
depth_radius_ = OperatorBase::GetOptionalArg<int>("depth_radius", 5); depth_radius_ = OperatorBase::GetOptionalArg<int>("depth_radius", 5);
bias_ = OperatorBase::GetOptionalArg<float>("bias", 1.0f); bias_ = OperatorBase::GetOptionalArg<float>("bias", 1.0f);
alpha_ = OperatorBase::GetOptionalArg<float>("alpha", 1.0f); alpha_ = OperatorBase::GetOptionalArg<float>("alpha", 1.0f);
......
...@@ -46,7 +46,7 @@ void Simple() { ...@@ -46,7 +46,7 @@ void Simple() {
} }
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 1, 2, 6}, {1, 1, 2, 6},
{0.28, 0.28, 0.39, 0.39, 0.51, 0.51, 0.34, 0.34, 0.40, 0.40, 0.47, 0.47}); {0.28, 0.28, 0.39, 0.39, 0.51, 0.51, 0.34, 0.34, 0.40, 0.40, 0.47, 0.47});
......
...@@ -26,10 +26,12 @@ namespace ops { ...@@ -26,10 +26,12 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class LSTMCellOp : public Operator<D, T> { class LSTMCellOp : public Operator<D, T> {
public: public:
LSTMCellOp(const OperatorDef &op_def, Workspace *ws) LSTMCellOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(static_cast<T>( functor_(context,
OperatorBase::GetOptionalArg<float>("scalar_input", 0.0))) {} static_cast<T>(
OperatorBase::GetOptionalArg<float>("scalar_input",
0.0))) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -24,8 +24,9 @@ namespace ops { ...@@ -24,8 +24,9 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class MatMulOp : public Operator<D, T> { class MatMulOp : public Operator<D, T> {
public: public:
MatMulOp(const OperatorDef &operator_def, Workspace *ws) MatMulOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(context),
transpose_a_(OperatorBase::GetOptionalArg<bool>("transpose_a", false)), transpose_a_(OperatorBase::GetOptionalArg<bool>("transpose_a", false)),
transpose_b_(OperatorBase::GetOptionalArg<bool>("transpose_b", false)) { transpose_b_(OperatorBase::GetOptionalArg<bool>("transpose_b", false)) {
} }
...@@ -46,7 +47,8 @@ class MatMulOp : public Operator<D, T> { ...@@ -46,7 +47,8 @@ class MatMulOp : public Operator<D, T> {
MACE_CHECK(ak == bk, "the number of A's column ", ak, MACE_CHECK(ak == bk, "the number of A's column ", ak,
" must be equal to B's row ", bk); " must be equal to B's row ", bk);
return functor_(A, B, C, transpose_a_, transpose_b_, future); return functor_(A, B, C,
transpose_a_, transpose_b_, future);
} }
private: private:
......
...@@ -65,7 +65,7 @@ void Simple(const std::vector<index_t> &A_shape, ...@@ -65,7 +65,7 @@ void Simple(const std::vector<index_t> &A_shape,
} }
// Check // Check
auto expected = CreateTensor<float>(C_shape, C_value); auto expected = net.CreateTensor<float>(C_shape, C_value);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -171,15 +171,15 @@ void Complex(const std::vector<index_t> &batch, ...@@ -171,15 +171,15 @@ void Complex(const std::vector<index_t> &batch,
// Check // Check
EXPECT_EQ(expected_output_shape, net.GetOutput("Output")->shape()); EXPECT_EQ(expected_output_shape, net.GetOutput("Output")->shape());
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
expected.Reshape({batch_count, height, out_width}); expected->Reshape({batch_count, height, out_width});
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-1); 1e-1);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5,
1e-5); 1e-5);
} }
} }
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
// Returns the process-wide OpTestContext singleton.
// The instance is created lazily on first use; C++11 guarantees
// thread-safe initialization of function-local statics, and the
// instance lives until program exit.
OpTestContext *OpTestContext::Get() {
  static OpTestContext instance;
  return &instance;
}
// Accessor for the shared GPUContext (holds the OpenCL tuner and cache
// storage) used when constructing GPU devices for tests. Returns a copy
// of the shared_ptr, so callers share ownership with the context.
std::shared_ptr<GPUContext> OpTestContext::gpu_context() const {
  return gpu_context_;
}
// Returns the cached Device for |device_type|; ownership remains with
// the context (do not delete the returned pointer).
// NOTE(review): operator[] default-inserts a null unique_ptr for a type
// not populated in the constructor (only CPU and GPU are), so an unknown
// device type silently yields nullptr — callers must pass CPU or GPU.
Device *OpTestContext::GetDevice(DeviceType device_type) {
  return device_map_[device_type].get();
}
// Builds the default devices used by the ops tests: one CPU device and
// one GPU device that shares this context's OpenCL tuner and cache
// storage, at normal GPU priority.
// NOTE(review): the -1 passed to CPUDevice presumably selects a default
// thread count — confirm against the CPUDevice constructor.
OpTestContext::OpTestContext() : gpu_context_(new GPUContext()) {
  device_map_[DeviceType::CPU] = std::unique_ptr<Device>(new CPUDevice(-1));
  device_map_[DeviceType::GPU] = std::unique_ptr<Device>(
      new GPUDevice(gpu_context_->opencl_tuner(),
                    gpu_context_->opencl_cache_storage(),
                    GPUPriorityHint::PRIORITY_NORMAL));
}
} // namespace test
} // namespace ops
} // namespace mace
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <functional> #include <functional>
#include <limits> #include <limits>
#include <map>
#include <memory> #include <memory>
#include <random> #include <random>
#include <string> #include <string>
...@@ -26,7 +27,8 @@ ...@@ -26,7 +27,8 @@
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "mace/core/net.h" #include "mace/core/net.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/device_context.h"
#include "mace/core/runtime/opencl/gpu_device.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/workspace.h" #include "mace/core/workspace.h"
#include "mace/kernels/opencl/common.h" #include "mace/kernels/opencl/common.h"
...@@ -110,9 +112,28 @@ class OpDefBuilder { ...@@ -110,9 +112,28 @@ class OpDefBuilder {
OperatorDef op_def_; OperatorDef op_def_;
}; };
// Process-wide singleton owning the test Devices (CPU/GPU) and the
// shared GPUContext used by the ops unit tests. Replaces the former
// global static device state. Non-copyable.
class OpTestContext {
 public:
  // Lazily-created singleton accessor.
  static OpTestContext *Get();
  // Shared GPU context (OpenCL tuner / binary cache storage).
  std::shared_ptr<GPUContext> gpu_context() const;
  // Cached device for |device_type|; the context retains ownership.
  Device *GetDevice(DeviceType device_type);

 private:
  OpTestContext();
  MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext);

  std::shared_ptr<GPUContext> gpu_context_;
  std::map<DeviceType, std::unique_ptr<Device>> device_map_;
};
class OpsTestNet { class OpsTestNet {
public: public:
OpsTestNet() : op_registry_(new OperatorRegistry()) {} OpsTestNet() :
op_registry_(new OperatorRegistry()) {
}
// Waits for any outstanding GPU work (via Sync) before the net and its
// workspace are destroyed, so OpenCL resources are not torn down while
// the command queue is still executing.
~OpsTestNet() {
  Sync();
}
template <DeviceType D, typename T> template <DeviceType D, typename T>
void AddInputFromArray(const std::string &name, void AddInputFromArray(const std::string &name,
...@@ -121,7 +142,8 @@ class OpsTestNet { ...@@ -121,7 +142,8 @@ class OpsTestNet {
const float scale = 0.0, const float scale = 0.0,
const int32_t zero_point = 0) { const int32_t zero_point = 0) {
Tensor *input = Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v()); ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
input->Resize(shape); input->Resize(shape);
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>(); T *input_data = input->mutable_data<T>();
...@@ -136,7 +158,8 @@ class OpsTestNet { ...@@ -136,7 +158,8 @@ class OpsTestNet {
const std::vector<index_t> &shape, const std::vector<index_t> &shape,
const T data) { const T data) {
Tensor *input = Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v()); ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
input->Resize(shape); input->Resize(shape);
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>(); T *input_data = input->mutable_data<T>();
...@@ -149,7 +172,8 @@ class OpsTestNet { ...@@ -149,7 +172,8 @@ class OpsTestNet {
bool positive = true, bool positive = true,
bool truncate = false) { bool truncate = false) {
Tensor *input = Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v()); ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
input->Resize(shape); input->Resize(shape);
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>(); T *input_data = input->mutable_data<T>();
...@@ -184,8 +208,10 @@ class OpsTestNet { ...@@ -184,8 +208,10 @@ class OpsTestNet {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void Transpose2D(const std::string &src_name, const std::string &dst_name) { void Transpose2D(const std::string &src_name, const std::string &dst_name) {
Tensor *input = ws_.GetTensor(src_name); Tensor *input = ws_.GetTensor(src_name);
Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D), Tensor *output = ws_.CreateTensor(
DataTypeToEnum<T>::v()); dst_name,
OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape(); const std::vector<index_t> input_shape = input->shape();
MACE_CHECK(input_shape.size() == 2, "input shape != 2"); MACE_CHECK(input_shape.size() == 2, "input shape != 2");
output->Resize({input_shape[1], input_shape[0]}); output->Resize({input_shape[1], input_shape[0]});
...@@ -205,8 +231,10 @@ class OpsTestNet { ...@@ -205,8 +231,10 @@ class OpsTestNet {
void CopyData(const std::string &src_name, void CopyData(const std::string &src_name,
const std::string &dst_name) { const std::string &dst_name) {
Tensor *input = ws_.GetTensor(src_name); Tensor *input = ws_.GetTensor(src_name);
Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D), Tensor *output = ws_.CreateTensor(
DataTypeToEnum<T>::v()); dst_name,
OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape(); const std::vector<index_t> input_shape = input->shape();
output->Resize(input_shape); output->Resize(input_shape);
...@@ -222,8 +250,10 @@ class OpsTestNet { ...@@ -222,8 +250,10 @@ class OpsTestNet {
const std::string &dst_name, const std::string &dst_name,
const DataFormat dst_format) { const DataFormat dst_format) {
Tensor *input = ws_.GetTensor(src_name); Tensor *input = ws_.GetTensor(src_name);
Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D), Tensor *output = ws_.CreateTensor(
DataTypeToEnum<T>::v()); dst_name,
OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape(); const std::vector<index_t> input_shape = input->shape();
MACE_CHECK(input_shape.size() == 4, "input shape != 4"); MACE_CHECK(input_shape.size() == 4, "input shape != 4");
...@@ -352,8 +382,10 @@ class OpsTestNet { ...@@ -352,8 +382,10 @@ class OpsTestNet {
void FillNHWCInputToNCHWInput(const std::string &name_nchw, void FillNHWCInputToNCHWInput(const std::string &name_nchw,
const std::string &name_nhwc) { const std::string &name_nhwc) {
Tensor *input = ws_.GetTensor(name_nhwc); Tensor *input = ws_.GetTensor(name_nhwc);
Tensor *output = ws_.CreateTensor(name_nchw, GetDeviceAllocator(D), Tensor *output = ws_.CreateTensor(
DataTypeToEnum<T>::v()); name_nchw,
OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape(); const std::vector<index_t> input_shape = input->shape();
index_t batch = input_shape[0]; index_t batch = input_shape[0];
index_t height = input_shape[1]; index_t height = input_shape[1];
...@@ -374,6 +406,22 @@ class OpsTestNet { ...@@ -374,6 +406,22 @@ class OpsTestNet {
} }
} }
// Create standalone tensor on device D with T type.
// The tensor is allocated from device D's allocator. When |data| is
// non-empty, the tensor is resized to |shape| and the values are copied
// in; when |data| is empty the tensor is returned unshaped/unallocated
// (useful as a destination for Tensor::Copy).
// NOTE(review): data.size() is assumed to match the element count
// implied by |shape| — the memcpy below does not validate this.
template <typename T, DeviceType D = DeviceType::CPU>
std::unique_ptr<Tensor> CreateTensor(
    const std::vector<index_t> &shape = {},
    const std::vector<T> &data = {}) {
  std::unique_ptr<Tensor> res(
      new Tensor(OpTestContext::Get()->GetDevice(D)->allocator(),
                 DataTypeToEnum<T>::v()));
  if (!data.empty()) {
    res->Resize(shape);
    T *input_data = res->mutable_data<T>();
    memcpy(input_data, data.data(), data.size() * sizeof(T));
  }
  return res;
}
OperatorDef *NewOperatorDef() { OperatorDef *NewOperatorDef() {
op_defs_.clear(); op_defs_.clear();
op_defs_.emplace_back(OperatorDef()); op_defs_.emplace_back(OperatorDef());
...@@ -392,8 +440,9 @@ class OpsTestNet { ...@@ -392,8 +440,9 @@ class OpsTestNet {
for (auto &op_def_ : op_defs_) { for (auto &op_def_ : op_defs_) {
net_def.add_op()->CopyFrom(op_def_); net_def.add_op()->CopyFrom(op_def_);
} }
net_ = CreateNet(op_registry_, net_def, &ws_, device); net_ = CreateNet(op_registry_, net_def, &ws_,
device_ = device; OpTestContext::Get()->GetDevice(device));
device_type_ = device;
return net_ != nullptr; return net_ != nullptr;
} }
...@@ -416,10 +465,15 @@ class OpsTestNet { ...@@ -416,10 +465,15 @@ class OpsTestNet {
MaceStatus RunOp() { return RunOp(DeviceType::CPU); } MaceStatus RunOp() { return RunOp(DeviceType::CPU); }
MaceStatus RunNet(const NetDef &net_def, const DeviceType device) { MaceStatus RunNet(const NetDef &net_def, const DeviceType device) {
device_ = device; device_type_ = device;
net_ = CreateNet(op_registry_, net_def, &ws_, device, NetMode::INIT); net_ = CreateNet(op_registry_,
net_def,
&ws_,
OpTestContext::Get()->GetDevice(device),
NetMode::INIT);
MACE_RETURN_IF_ERROR(net_->Run()); MACE_RETURN_IF_ERROR(net_->Run());
net_ = CreateNet(op_registry_, net_def, &ws_, device); net_ = CreateNet(op_registry_, net_def, &ws_,
OpTestContext::Get()->GetDevice(device));
return net_->Run(); return net_->Run();
} }
...@@ -432,9 +486,12 @@ class OpsTestNet { ...@@ -432,9 +486,12 @@ class OpsTestNet {
} }
void Sync() { void Sync() {
if (net_ && device_ == DeviceType::GPU) { #ifdef MACE_ENABLE_OPENCL
OpenCLRuntime::Global()->command_queue().finish(); if (net_ && device_type_ == DeviceType::GPU) {
OpTestContext::Get()->GetDevice(DeviceType::GPU)->opencl_runtime()
->command_queue().finish();
} }
#endif
} }
public: public:
...@@ -442,17 +499,17 @@ class OpsTestNet { ...@@ -442,17 +499,17 @@ class OpsTestNet {
Workspace ws_; Workspace ws_;
std::vector<OperatorDef> op_defs_; std::vector<OperatorDef> op_defs_;
std::unique_ptr<NetBase> net_; std::unique_ptr<NetBase> net_;
DeviceType device_; DeviceType device_type_;
}; };
class OpsTestBase : public ::testing::Test { class OpsTestBase : public ::testing::Test {
protected: protected:
virtual void SetUp() { virtual void SetUp() {
// OpenCLRuntime::CreateGlobal(); SetOpenMPThreadsAndAffinityPolicy(-1,
CPUAffinityPolicy::AFFINITY_BIG_ONLY);
} }
virtual void TearDown() { virtual void TearDown() {
// OpenCLRuntime::DestroyGlobal();
} }
}; };
...@@ -510,17 +567,6 @@ std::vector<T> VectorStaticCast(const std::vector<float> &&src) { ...@@ -510,17 +567,6 @@ std::vector<T> VectorStaticCast(const std::vector<float> &&src) {
return std::move(dest); return std::move(dest);
} }
template <typename T>
std::unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
const std::vector<T> &data) {
std::unique_ptr<Tensor> res(
new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v()));
res->Resize(shape);
T *input_data = res->mutable_data<T>();
memcpy(input_data, data.data(), data.size() * sizeof(T));
return res;
}
inline bool IsSameSize(const Tensor &x, const Tensor &y) { inline bool IsSameSize(const Tensor &x, const Tensor &y) {
if (x.dim_size() != y.dim_size()) return false; if (x.dim_size() != y.dim_size()) return false;
for (int d = 0; d < x.dim_size(); ++d) { for (int d = 0; d < x.dim_size(); ++d) {
......
...@@ -26,9 +26,10 @@ namespace ops { ...@@ -26,9 +26,10 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class PadOp : public Operator<D, T> { class PadOp : public Operator<D, T> {
public: public:
PadOp(const OperatorDef &operator_def, Workspace *ws) PadOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetRepeatedArgs<int>("paddings"), functor_(context,
OperatorBase::GetRepeatedArgs<int>("paddings"),
OperatorBase::GetOptionalArg<float>("constant_value", 0.0)) {} OperatorBase::GetOptionalArg<float>("constant_value", 0.0)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
......
...@@ -63,7 +63,7 @@ void Simple() { ...@@ -63,7 +63,7 @@ void Simple() {
auto output = net.GetTensor("Output"); auto output = net.GetTensor("Output");
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 5, 6, 1}, { {1, 5, 6, 1}, {
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2, 2, 2, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2, 2, 2,
1.0, 1.0, 1.0, 2, 2, 2, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2, 2, 2, 1.0, 1.0, 1.0, 1.0,
...@@ -99,7 +99,7 @@ TEST_F(PadTest, ComplexCPU) { ...@@ -99,7 +99,7 @@ TEST_F(PadTest, ComplexCPU) {
auto output = net.GetTensor("Output"); auto output = net.GetTensor("Output");
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 3, 3, 4}, {1, 3, 3, 4},
{ {
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
...@@ -134,8 +134,8 @@ void Complex(const std::vector<index_t> &input_shape, ...@@ -134,8 +134,8 @@ void Complex(const std::vector<index_t> &input_shape,
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output", net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC); NHWC);
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -155,9 +155,9 @@ void Complex(const std::vector<index_t> &input_shape, ...@@ -155,9 +155,9 @@ void Complex(const std::vector<index_t> &input_shape,
auto output = net.GetTensor("OpenCLOutput"); auto output = net.GetTensor("OpenCLOutput");
if (DataTypeToEnum<T>::value == DT_HALF) { if (DataTypeToEnum<T>::value == DT_HALF) {
ExpectTensorNear<float>(expected, *output, 1e-2, 1e-2); ExpectTensorNear<float>(*expected, *output, 1e-2, 1e-2);
} else { } else {
ExpectTensorNear<float>(expected, *output, 1e-5); ExpectTensorNear<float>(*expected, *output, 1e-5);
} }
} }
} // namespace } // namespace
......
...@@ -27,13 +27,14 @@ namespace ops { ...@@ -27,13 +27,14 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class PoolingOp : public ConvPool2dOpBase<D, T> { class PoolingOp : public ConvPool2dOpBase<D, T> {
public: public:
PoolingOp(const OperatorDef &op_def, Workspace *ws) PoolingOp(const OperatorDef &op_def, OpKernelContext *context)
: ConvPool2dOpBase<D, T>(op_def, ws), : ConvPool2dOpBase<D, T>(op_def, context),
kernels_(OperatorBase::GetRepeatedArgs<int>("kernels")), kernels_(OperatorBase::GetRepeatedArgs<int>("kernels")),
pooling_type_( pooling_type_(
static_cast<PoolingType>(OperatorBase::GetOptionalArg<int>( static_cast<PoolingType>(OperatorBase::GetOptionalArg<int>(
"pooling_type", static_cast<int>(AVG)))), "pooling_type", static_cast<int>(AVG)))),
functor_(pooling_type_, functor_(context,
pooling_type_,
kernels_.data(), kernels_.data(),
this->strides_.data(), this->strides_.data(),
this->padding_type_, this->padding_type_,
......
...@@ -57,7 +57,7 @@ TEST_F(PoolingOpTest, MAX_VALID) { ...@@ -57,7 +57,7 @@ TEST_F(PoolingOpTest, MAX_VALID) {
// Check // Check
auto expected = auto expected =
CreateTensor<float>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); net.CreateTensor<float>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -90,7 +90,7 @@ TEST_F(PoolingOpTest, MAX_SAME) { ...@@ -90,7 +90,7 @@ TEST_F(PoolingOpTest, MAX_SAME) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 2, 2, 1}, {4, 5, 7, 8}); auto expected = net.CreateTensor<float>({1, 2, 2, 1}, {4, 5, 7, 8});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -124,7 +124,7 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { ...@@ -124,7 +124,7 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 2, 2, 1}, {10, 11, 14, 15}); auto expected = net.CreateTensor<float>({1, 2, 2, 1}, {10, 11, 14, 15});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -158,7 +158,7 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { ...@@ -158,7 +158,7 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 5, 1}, {10, 12, 14, 16, 17}); auto expected = net.CreateTensor<float>({1, 1, 5, 1}, {10, 12, 14, 16, 17});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -209,7 +209,7 @@ void SimpleMaxPooling3S2() { ...@@ -209,7 +209,7 @@ void SimpleMaxPooling3S2() {
} }
// Check // Check
auto expected = CreateTensor<float>({1, 1, 4, 1}, {20, 22, 24, 26}); auto expected = net.CreateTensor<float>({1, 1, 4, 1}, {20, 22, 24, 26});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -249,8 +249,8 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape, ...@@ -249,8 +249,8 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC); NHWC);
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -269,10 +269,10 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape, ...@@ -269,10 +269,10 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_HALF) { if (DataTypeToEnum<T>::value == DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-3, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-3,
1e-4); 1e-4);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
} }
} // namespace } // namespace
...@@ -334,7 +334,7 @@ TEST_F(PoolingOpTest, AVG_VALID) { ...@@ -334,7 +334,7 @@ TEST_F(PoolingOpTest, AVG_VALID) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5}); {1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -368,7 +368,7 @@ void SimpleAvgPoolingTest() { ...@@ -368,7 +368,7 @@ void SimpleAvgPoolingTest() {
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5}); auto expected = net.CreateTensor<float>({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -407,8 +407,8 @@ void AvgPoolingTest(const std::vector<index_t> &shape, ...@@ -407,8 +407,8 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC); NHWC);
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -427,10 +427,10 @@ void AvgPoolingTest(const std::vector<index_t> &shape, ...@@ -427,10 +427,10 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_HALF) { if (DataTypeToEnum<T>::value == DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-3, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-3,
1e-3); 1e-3);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
} }
} // namespace } // namespace
...@@ -503,7 +503,7 @@ TEST_F(PoolingOpTest, QUANT_MAX_VALID) { ...@@ -503,7 +503,7 @@ TEST_F(PoolingOpTest, QUANT_MAX_VALID) {
// Check // Check
auto expected = auto expected =
CreateTensor<uint8_t>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); net.CreateTensor<uint8_t>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31});
ExpectTensorNear<uint8_t>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<uint8_t>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -531,7 +531,7 @@ TEST_F(PoolingOpTest, QUANT_MAX_SAME) { ...@@ -531,7 +531,7 @@ TEST_F(PoolingOpTest, QUANT_MAX_SAME) {
net.RunOp(); net.RunOp();
// Check // Check
auto expected = CreateTensor<uint8_t>({1, 2, 2, 1}, {4, 5, 7, 8}); auto expected = net.CreateTensor<uint8_t>({1, 2, 2, 1}, {4, 5, 7, 8});
ExpectTensorNear<uint8_t>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<uint8_t>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -561,7 +561,7 @@ TEST_F(PoolingOpTest, QUANT_AVG_VALID) { ...@@ -561,7 +561,7 @@ TEST_F(PoolingOpTest, QUANT_AVG_VALID) {
net.RunOp(); net.RunOp();
// Check // Check
auto expected = CreateTensor<uint8_t>( auto expected = net.CreateTensor<uint8_t>(
{1, 2, 2, 2}, {3, 19, 5, 21, 11, 27, 13, 29}); {1, 2, 2, 2}, {3, 19, 5, 21, 11, 27, 13, 29});
ExpectTensorNear<uint8_t>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<uint8_t>(*expected, *net.GetOutput("Output"), 1e-5);
......
...@@ -24,9 +24,10 @@ namespace ops { ...@@ -24,9 +24,10 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class ProposalOp : public Operator<D, T> { class ProposalOp : public Operator<D, T> {
public: public:
ProposalOp(const OperatorDef &operator_def, Workspace *ws) ProposalOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetOptionalArg<int>("min_size", 16), functor_(context,
OperatorBase::GetOptionalArg<int>("min_size", 16),
OperatorBase::GetOptionalArg<float>("nms_thresh", 0.7), OperatorBase::GetOptionalArg<float>("nms_thresh", 0.7),
OperatorBase::GetOptionalArg<int>("pre_nms_top_n", 6000), OperatorBase::GetOptionalArg<int>("pre_nms_top_n", 6000),
OperatorBase::GetOptionalArg<int>("post_nms_top_n", 300), OperatorBase::GetOptionalArg<int>("post_nms_top_n", 300),
......
...@@ -60,7 +60,8 @@ TEST_F(ProposalOpTest, CPUSimple) { ...@@ -60,7 +60,8 @@ TEST_F(ProposalOpTest, CPUSimple) {
// Run // Run
net.RunOp(); net.RunOp();
auto expected_tensor = CreateTensor<float>({1, 1, 1, 5}, {0, 0, 0, 255, 255}); auto expected_tensor = net.CreateTensor<float>({1, 1, 1, 5},
{0, 0, 0, 255, 255});
ExpectTensorNear<float>(*expected_tensor, *net.GetTensor("Output"), 1e-5); ExpectTensorNear<float>(*expected_tensor, *net.GetTensor("Output"), 1e-5);
} }
......
...@@ -24,8 +24,9 @@ namespace ops { ...@@ -24,8 +24,9 @@ namespace ops {
template<DeviceType D, class T> template<DeviceType D, class T>
class QuantizeOp : public Operator<D, T> { class QuantizeOp : public Operator<D, T> {
public: public:
QuantizeOp(const OperatorDef &operator_def, Workspace *ws) QuantizeOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(context),
non_zero_( non_zero_(
static_cast<bool>(OperatorBase::GetOptionalArg<int>("non_zero", static_cast<bool>(OperatorBase::GetOptionalArg<int>("non_zero",
0))) {} 0))) {}
...@@ -50,8 +51,8 @@ class QuantizeOp : public Operator<D, T> { ...@@ -50,8 +51,8 @@ class QuantizeOp : public Operator<D, T> {
template<DeviceType D, class T> template<DeviceType D, class T>
class DequantizeOp : public Operator<D, T> { class DequantizeOp : public Operator<D, T> {
public: public:
DequantizeOp(const OperatorDef &operator_def, Workspace *ws) DequantizeOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws) {} : Operator<D, T>(operator_def, context), functor_(context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -27,9 +27,10 @@ namespace ops { ...@@ -27,9 +27,10 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class ReduceMeanOp : public Operator<D, T> { class ReduceMeanOp : public Operator<D, T> {
public: public:
ReduceMeanOp(const OperatorDef &operator_def, Workspace *ws) ReduceMeanOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetRepeatedArgs<int>("axis"), functor_(context,
OperatorBase::GetRepeatedArgs<int>("axis"),
OperatorBase::GetOptionalArg<bool>("keepdims", false)) {} OperatorBase::GetOptionalArg<bool>("keepdims", false)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
......
...@@ -57,7 +57,7 @@ void Simple(const std::vector<index_t> &input_shape, ...@@ -57,7 +57,7 @@ void Simple(const std::vector<index_t> &input_shape,
ImageToBuffer<D, float>(&net, "OutputImg", "Output", ImageToBuffer<D, float>(&net, "OutputImg", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
auto expected = CreateTensor<float>(output_shape, output); auto expected = net.CreateTensor<float>(output_shape, output);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5, 1e-3); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5, 1e-3);
} }
......
...@@ -26,8 +26,8 @@ namespace ops { ...@@ -26,8 +26,8 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ReshapeOp : public Operator<D, T> { class ReshapeOp : public Operator<D, T> {
public: public:
ReshapeOp(const OperatorDef &op_def, Workspace *ws) ReshapeOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws) {} : Operator<D, T>(op_def, context), functor_(context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -24,9 +24,10 @@ namespace ops { ...@@ -24,9 +24,10 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class ResizeBicubicOp : public Operator<D, T> { class ResizeBicubicOp : public Operator<D, T> {
public: public:
ResizeBicubicOp(const OperatorDef &operator_def, Workspace *ws) ResizeBicubicOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1}), functor_(context,
OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1}),
OperatorBase::GetOptionalArg<bool>("align_corners", false)) {} OperatorBase::GetOptionalArg<bool>("align_corners", false)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
......
...@@ -48,7 +48,7 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) { ...@@ -48,7 +48,7 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
} }
...@@ -77,7 +77,7 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) { ...@@ -77,7 +77,7 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 2, 3, 3}, auto expected = net.CreateTensor<float>({1, 2, 3, 3},
{0., 1., 2., 4.110297, 5.110297, 6.110297, {0., 1., 2., 4.110297, 5.110297, 6.110297,
8.223037, 9.223036, 10.223037, 24., 25., 26., 8.223037, 9.223036, 10.223037, 24., 25., 26.,
28.110298, 29.1103, 30.110298, 32.223038, 33.223038, 34.223038}); 28.110298, 29.1103, 30.110298, 32.223038, 33.223038, 34.223038});
...@@ -110,7 +110,7 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) { ...@@ -110,7 +110,7 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
} }
......
...@@ -24,9 +24,10 @@ namespace ops { ...@@ -24,9 +24,10 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class ResizeBilinearOp : public Operator<D, T> { class ResizeBilinearOp : public Operator<D, T> {
public: public:
ResizeBilinearOp(const OperatorDef &operator_def, Workspace *ws) ResizeBilinearOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1}), functor_(context,
OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1}),
OperatorBase::GetOptionalArg<bool>("align_corners", false)) {} OperatorBase::GetOptionalArg<bool>("align_corners", false)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
......
...@@ -48,7 +48,7 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) { ...@@ -48,7 +48,7 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -78,7 +78,7 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { ...@@ -78,7 +78,7 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -116,8 +116,8 @@ void TestRandomResizeBilinear() { ...@@ -116,8 +116,8 @@ void TestRandomResizeBilinear() {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC); "Output", NHWC);
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
if (D == DeviceType::GPU) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
...@@ -136,7 +136,7 @@ void TestRandomResizeBilinear() { ...@@ -136,7 +136,7 @@ void TestRandomResizeBilinear() {
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
// Check // Check
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 1e-5, ExpectTensorNear<float>(*expected, *net.GetOutput("DeviceOutput"), 1e-5,
1e-6); 1e-6);
} }
} }
......
...@@ -26,9 +26,10 @@ namespace ops { ...@@ -26,9 +26,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ScalarMathOp : public Operator<D, T> { class ScalarMathOp : public Operator<D, T> {
public: public:
ScalarMathOp(const OperatorDef &op_def, Workspace *ws) ScalarMathOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(static_cast<kernels::EltwiseType>( functor_(context,
static_cast<kernels::EltwiseType>(
OperatorBase::GetOptionalArg<int>( OperatorBase::GetOptionalArg<int>(
"type", static_cast<int>(kernels::EltwiseType::NONE))), "type", static_cast<int>(kernels::EltwiseType::NONE))),
OperatorBase::GetRepeatedArgs<float>("coeff"), OperatorBase::GetRepeatedArgs<float>("coeff"),
......
...@@ -49,60 +49,60 @@ void ScalarMathTest(const kernels::EltwiseType type, ...@@ -49,60 +49,60 @@ void ScalarMathTest(const kernels::EltwiseType type,
net.RunOp(D); net.RunOp(D);
auto expected = CreateTensor<DstType>({}, {output}); auto expected = net.CreateTensor<DstType>({}, {output});
ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5);
} }
} // namespace } // namespace
TEST_F(ScalarMathOpTest, SimpleCPU) { TEST_F(ScalarMathOpTest, SimpleCPU) {
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUM, 1, 2, 3, 3); kernels::EltwiseType::SUM, 1, 2, 3, 3);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUB, 1, 2, 3, -1); kernels::EltwiseType::SUB, 1, 2, 3, -1);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::PROD, 3, -2, 3, -6); kernels::EltwiseType::PROD, 3, -2, 3, -6);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::DIV, 3, -2, 1, -1.5); kernels::EltwiseType::DIV, 3, -2, 1, -1.5);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::MIN, 3, -2, 1, -2); kernels::EltwiseType::MIN, 3, -2, 1, -2);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::MAX, 3, -2, 1, 3); kernels::EltwiseType::MAX, 3, -2, 1, 3);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::NEG, 3, -2, 1, -3); kernels::EltwiseType::NEG, 3, -2, 1, -3);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::ABS, 3, -2, 1, 3); kernels::EltwiseType::ABS, 3, -2, 1, 3);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25); kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::POW, 3, 1, 1, 3); kernels::EltwiseType::POW, 3, 1, 1, 3);
ScalarMathTest<DeviceType::CPU, float, int32_t>( ScalarMathTest<DeviceType::CPU, float, int32_t>(
kernels::EltwiseType::EQUAL, 3, 3, 1, 1); kernels::EltwiseType::EQUAL, 3, 3, 1, 1);
} }
TEST_F(ScalarMathOpTest, SimpleGPU) { TEST_F(ScalarMathOpTest, SimpleGPU) {
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::SUM, 1, 2, 1, 3); kernels::EltwiseType::SUM, 1, 2, 1, 3);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::SUB, 1, 2, 1, -1); kernels::EltwiseType::SUB, 1, 2, 1, -1);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::PROD, 3, -2, 1, -6); kernels::EltwiseType::PROD, 3, -2, 1, -6);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::DIV, 3, -2, 1, -1.5); kernels::EltwiseType::DIV, 3, -2, 1, -1.5);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::MIN, 3, -2, 1, -2); kernels::EltwiseType::MIN, 3, -2, 1, -2);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::MAX, 3, -2, 1, 3); kernels::EltwiseType::MAX, 3, -2, 1, 3);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::NEG, 3, -2, 1, -3); kernels::EltwiseType::NEG, 3, -2, 1, -3);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::ABS, 3, -2, 1, 3); kernels::EltwiseType::ABS, 3, -2, 1, 3);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25); kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::POW, 3, 1, 1, 3); kernels::EltwiseType::POW, 3, 1, 1, 3);
ScalarMathTest<DeviceType::GPU, float, int32_t>( ScalarMathTest<DeviceType::GPU, float, int32_t>(
kernels::EltwiseType::EQUAL, 3, 3, 1, 1); kernels::EltwiseType::EQUAL, 3, 3, 1, 1);
} }
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -25,8 +25,8 @@ namespace ops { ...@@ -25,8 +25,8 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ShapeOp : public Operator<D, T> { class ShapeOp : public Operator<D, T> {
public: public:
ShapeOp(const OperatorDef &op_def, Workspace *ws) ShapeOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws) {} : Operator<D, T>(op_def, context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -24,8 +24,9 @@ namespace ops { ...@@ -24,8 +24,9 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class SoftmaxOp : public Operator<D, T> { class SoftmaxOp : public Operator<D, T> {
public: public:
SoftmaxOp(const OperatorDef &operator_def, Workspace *ws) SoftmaxOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws) {} : Operator<D, T>(operator_def, context),
functor_(context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *logits = this->Input(LOGITS); const Tensor *logits = this->Input(LOGITS);
......
...@@ -29,7 +29,7 @@ void Simple() { ...@@ -29,7 +29,7 @@ void Simple() {
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4}, net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4},
{1, 1, 1, 1, 1, 2, 3, 4}); {1, 1, 1, 1, 1, 2, 3, 4});
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 1, 2, 4}, {1, 1, 2, 4},
{0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426}); {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
...@@ -113,8 +113,8 @@ void Complex(const std::vector<index_t> &logits_shape) { ...@@ -113,8 +113,8 @@ void Complex(const std::vector<index_t> &logits_shape) {
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC); net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
} }
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -131,7 +131,7 @@ void Complex(const std::vector<index_t> &logits_shape) { ...@@ -131,7 +131,7 @@ void Complex(const std::vector<index_t> &logits_shape) {
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
} // namespace } // namespace
......
...@@ -27,9 +27,10 @@ namespace ops { ...@@ -27,9 +27,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class SpaceToBatchNDOp : public Operator<D, T> { class SpaceToBatchNDOp : public Operator<D, T> {
public: public:
SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws) SpaceToBatchNDOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetRepeatedArgs<int>("paddings", {0, 0, 0, 0}), functor_(context,
OperatorBase::GetRepeatedArgs<int>("paddings", {0, 0, 0, 0}),
OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1}), OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1}),
false) {} false) {}
......
...@@ -116,24 +116,23 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape, ...@@ -116,24 +116,23 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
const std::vector<int> &padding_data, const std::vector<int> &padding_data,
const std::vector<index_t> &batch_shape, const std::vector<index_t> &batch_shape,
const std::vector<float> &batch_data) { const std::vector<float> &batch_data) {
auto space_tensor = std::unique_ptr<Tensor>( OpsTestNet net;
new Tensor(GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<T>::v())); auto space_tensor = net.CreateTensor<T, GPU>();
space_tensor->Resize(space_shape); space_tensor->Resize(space_shape);
{ {
Tensor::MappingGuard space_mapper(space_tensor.get()); Tensor::MappingGuard space_mapper(space_tensor.get());
T *space_ptr = space_tensor->mutable_data<T>(); T *space_ptr = space_tensor->template mutable_data<T>();
MACE_CHECK(static_cast<size_t>(space_tensor->size()) == space_data.size()) MACE_CHECK(static_cast<size_t>(space_tensor->size()) == space_data.size())
<< "Space tensor size:" << space_tensor->size() << "Space tensor size:" << space_tensor->size()
<< ", space data size:" << space_data.size(); << ", space data size:" << space_data.size();
memcpy(space_ptr, space_data.data(), space_data.size() * sizeof(T)); memcpy(space_ptr, space_data.data(), space_data.size() * sizeof(T));
} }
auto batch_tensor = std::unique_ptr<Tensor>( auto batch_tensor = net.CreateTensor<T, GPU>();
new Tensor(GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<T>::v()));
batch_tensor->Resize(batch_shape); batch_tensor->Resize(batch_shape);
{ {
Tensor::MappingGuard batch_mapper(batch_tensor.get()); Tensor::MappingGuard batch_mapper(batch_tensor.get());
T *batch_ptr = batch_tensor->mutable_data<T>(); T *batch_ptr = batch_tensor->template mutable_data<T>();
MACE_CHECK(static_cast<size_t>(batch_tensor->size()) == batch_data.size()); MACE_CHECK(static_cast<size_t>(batch_tensor->size()) == batch_data.size());
memcpy(batch_ptr, batch_data.data(), batch_data.size() * sizeof(T)); memcpy(batch_ptr, batch_data.data(), batch_data.size() * sizeof(T));
} }
......
...@@ -27,9 +27,11 @@ namespace ops { ...@@ -27,9 +27,11 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class SpaceToDepthOp : public Operator<D, T> { class SpaceToDepthOp : public Operator<D, T> {
public: public:
SpaceToDepthOp(const OperatorDef &op_def, Workspace *ws) SpaceToDepthOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetOptionalArg<int>("block_size", 1), false) {} functor_(context,
OperatorBase::GetOptionalArg<int>("block_size", 1),
false) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -26,9 +26,9 @@ namespace ops { ...@@ -26,9 +26,9 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class SplitOp : public Operator<D, T> { class SplitOp : public Operator<D, T> {
public: public:
SplitOp(const OperatorDef &op_def, Workspace *ws) SplitOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetOptionalArg<int>("axis", 3)) {} functor_(context, OperatorBase::GetOptionalArg<int>("axis", 3)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
MACE_CHECK(this->OutputSize() >= 2) MACE_CHECK(this->OutputSize() >= 2)
......
...@@ -26,8 +26,8 @@ namespace ops { ...@@ -26,8 +26,8 @@ namespace ops {
template<DeviceType D, typename T> template<DeviceType D, typename T>
class SqueezeOp : public Operator<D, T> { class SqueezeOp : public Operator<D, T> {
public: public:
SqueezeOp(const OperatorDef &op_def, Workspace *ws) SqueezeOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
axis_(OperatorBase::GetRepeatedArgs<int>("axis", {})) {} axis_(OperatorBase::GetRepeatedArgs<int>("axis", {})) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
......
...@@ -26,9 +26,9 @@ namespace ops { ...@@ -26,9 +26,9 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class StackOp : public Operator<D, T> { class StackOp : public Operator<D, T> {
public: public:
StackOp(const OperatorDef &operator_def, Workspace *ws) StackOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetOptionalArg<int>("axis", 0)) {} functor_(context, OperatorBase::GetOptionalArg<int>("axis", 0)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const std::vector<const Tensor *> &inputs = this->Inputs(); const std::vector<const Tensor *> &inputs = this->Inputs();
......
...@@ -24,9 +24,10 @@ namespace ops { ...@@ -24,9 +24,10 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class StridedSliceOp : public Operator<D, T> { class StridedSliceOp : public Operator<D, T> {
public: public:
StridedSliceOp(const OperatorDef &operator_def, Workspace *ws) StridedSliceOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetOptionalArg<int>("begin_mask", 0), functor_(context,
OperatorBase::GetOptionalArg<int>("begin_mask", 0),
OperatorBase::GetOptionalArg<int>("end_mask", 0), OperatorBase::GetOptionalArg<int>("end_mask", 0),
OperatorBase::GetOptionalArg<int>("ellipsis_mask", 0), OperatorBase::GetOptionalArg<int>("ellipsis_mask", 0),
OperatorBase::GetOptionalArg<int>("new_axis_mask", 0), OperatorBase::GetOptionalArg<int>("new_axis_mask", 0),
......
...@@ -26,10 +26,10 @@ namespace mace { ...@@ -26,10 +26,10 @@ namespace mace {
template <DeviceType D, class T> template <DeviceType D, class T>
class TransposeOp : public Operator<D, T> { class TransposeOp : public Operator<D, T> {
public: public:
TransposeOp(const OperatorDef &operator_def, Workspace *ws) TransposeOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
dims_(OperatorBase::GetRepeatedArgs<int>("dims")), dims_(OperatorBase::GetRepeatedArgs<int>("dims")),
functor_(dims_) {} functor_(context, dims_) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -26,9 +26,9 @@ namespace ops { ...@@ -26,9 +26,9 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class UnstackOp : public Operator<D, T> { class UnstackOp : public Operator<D, T> {
public: public:
UnstackOp(const OperatorDef &operator_def, Workspace *ws) UnstackOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetOptionalArg<int>("axis", 0)) {} functor_(context, OperatorBase::GetOptionalArg<int>("axis", 0)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -64,9 +64,10 @@ void WinogradConvolution(const index_t batch, ...@@ -64,9 +64,10 @@ void WinogradConvolution(const index_t batch,
// Transfer output // Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "ConvOutput", ImageToBuffer<D, float>(&net, "OutputImage", "ConvOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
Tensor expected;
expected.Copy(*net.GetOutput("ConvOutput")); auto expected = net.CreateTensor<float>();
auto output_shape = expected.shape(); expected->Copy(*net.GetOutput("ConvOutput"));
auto output_shape = expected->shape();
// Winograd convolution // Winograd convolution
// transform filter // transform filter
...@@ -124,9 +125,11 @@ void WinogradConvolution(const index_t batch, ...@@ -124,9 +125,11 @@ void WinogradConvolution(const index_t batch,
ImageToBuffer<D, float>(&net, "WinoOutputImage", "WinoOutput", ImageToBuffer<D, float>(&net, "WinoOutputImage", "WinoOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("WinoOutput"),
1e-2, 1e-2);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("WinoOutput"),
1e-5, 1e-4);
} }
} }
} // namespace } // namespace
...@@ -212,9 +215,9 @@ void WinogradConvolutionWithPad(const index_t batch, ...@@ -212,9 +215,9 @@ void WinogradConvolutionWithPad(const index_t batch,
// Transfer output // Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "ConvOutput", ImageToBuffer<D, float>(&net, "OutputImage", "ConvOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("ConvOutput")); expected->Copy(*net.GetOutput("ConvOutput"));
auto output_shape = expected.shape(); auto output_shape = expected->shape();
// Winograd convolution // Winograd convolution
// transform filter // transform filter
...@@ -272,9 +275,11 @@ void WinogradConvolutionWithPad(const index_t batch, ...@@ -272,9 +275,11 @@ void WinogradConvolutionWithPad(const index_t batch,
ImageToBuffer<D, float>(&net, "WinoOutputImage", "WinoOutput", ImageToBuffer<D, float>(&net, "WinoOutputImage", "WinoOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("WinoOutput"),
1e-2, 1e-2);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("WinoOutput"),
1e-5, 1e-4);
} }
} }
} // namespace } // namespace
......
...@@ -29,9 +29,11 @@ namespace ops { ...@@ -29,9 +29,11 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class WinogradInverseTransformOp : public Operator<D, T> { class WinogradInverseTransformOp : public Operator<D, T> {
public: public:
WinogradInverseTransformOp(const OperatorDef &op_def, Workspace *ws) WinogradInverseTransformOp(const OperatorDef &op_def,
: Operator<D, T>(op_def, ws), OpKernelContext *context)
functor_(kernels::StringToActivationType( : Operator<D, T>(op_def, context),
functor_(context,
kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation", OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")), "NOOP")),
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f), OperatorBase::GetOptionalArg<float>("max_limit", 0.0f),
......
...@@ -26,9 +26,10 @@ namespace ops { ...@@ -26,9 +26,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class WinogradTransformOp : public Operator<D, T> { class WinogradTransformOp : public Operator<D, T> {
public: public:
WinogradTransformOp(const OperatorDef &op_def, Workspace *ws) WinogradTransformOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>( functor_(context,
static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
"padding", static_cast<int>(VALID))), "padding", static_cast<int>(VALID))),
OperatorBase::GetRepeatedArgs<int>("padding_values"), OperatorBase::GetRepeatedArgs<int>("padding_values"),
OperatorBase::GetOptionalArg<int>( OperatorBase::GetOptionalArg<int>(
......
...@@ -11,7 +11,6 @@ cc_library( ...@@ -11,7 +11,6 @@ cc_library(
name = "public", name = "public",
hdrs = [ hdrs = [
"mace.h", "mace.h",
"mace_runtime.h",
], ],
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"], copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
) )
...@@ -24,12 +24,36 @@ ...@@ -24,12 +24,36 @@
#include <string> #include <string>
#include <vector> #include <vector>
#ifndef MACE_API
#define MACE_API __attribute__((visibility("default")))
#endif
namespace mace { namespace mace {
class NetDef; class NetDef;
enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 }; enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 };
// Performance hint for the GPU (currently only honored on Adreno GPUs,
// see MaceEngineConfig::SetGPUHints).
enum GPUPerfHint {
  PERF_DEFAULT = 0,
  PERF_LOW = 1,
  PERF_NORMAL = 2,
  PERF_HIGH = 3
};

// Priority hint for the GPU (currently only honored on Adreno GPUs,
// see MaceEngineConfig::SetGPUHints).
enum GPUPriorityHint {
  PRIORITY_DEFAULT = 0,
  PRIORITY_LOW = 1,
  PRIORITY_NORMAL = 2,
  PRIORITY_HIGH = 3
};

// CPU core affinity policy for worker threads
// (see MaceEngineConfig::SetCPUThreadPolicy).
enum CPUAffinityPolicy {
  AFFINITY_NONE = 0,
  AFFINITY_BIG_ONLY = 1,
  AFFINITY_LITTLE_ONLY = 2,
};
struct CallStats { struct CallStats {
int64_t start_micros; int64_t start_micros;
int64_t end_micros; int64_t end_micros;
...@@ -73,14 +97,167 @@ enum MaceStatus { ...@@ -73,14 +97,167 @@ enum MaceStatus {
} \ } \
} }
/// \brief Get ARM big.LITTLE configuration.
///
/// This function will detect the max frequencies of all CPU cores, and assume
/// the cores with largest max frequencies as big cores, and all the remaining
/// cores as little. If all CPU cores' max frequencies are equal, big_core_ids and
/// little_core_ids will both be filled with all cpu core ids.
///
/// \param [out] big_core_ids
/// \param [out] little_core_ids
/// \return If successful, it returns MACE_SUCCESS and error if it can't
/// reliably detect the frequency of big-LITTLE cores (e.g. MTK).
MACE_API MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
std::vector<int> *little_core_ids);
/// \brief GPU context contain the status used for GPU device.
///
/// The life cycle of GPUContext object is the same as MaceEngines use it.
/// Just use one GPUContext for all MaceEngines, which will speed up the
/// initialization procedure. There are some data in common between different
/// MaceEngines using GPU, use one GPUContext could avoid duplication.
class GPUContext;
/// \brief GPUContext builder.
///
/// Use the GPUContextBuilder to generate GPUContext.
/// Not thread-safe
class MACE_API GPUContextBuilder {
public:
GPUContextBuilder();
~GPUContextBuilder();
GPUContextBuilder(const GPUContextBuilder &) = delete;
GPUContextBuilder(const GPUContextBuilder &&) = delete;
GPUContextBuilder &operator=(const GPUContextBuilder &) = delete;
GPUContextBuilder &operator=(const GPUContextBuilder &&) = delete;
/// \brief Set internal storage factory to store internal data.
///
/// Now the path is used to store the built OpenCL binaries to file,
/// which could speed up the GPU initialization and first run.
/// If do not call this API, the initialization maybe slow for GPU.
///
/// \param path Make sure your program have Read/Write permission of the path
/// \return
GPUContextBuilder &SetStoragePath(const std::string &path);
/// \brief Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so) // NOLINT(whitespace/line_length)
///
/// if you use gpu of specific soc, Using OpenCL binary will speed up the initialization. // NOLINT(whitespace/line_length)
/// OpenCL binary is corresponding to the OpenCL Driver version,
/// you should update the binary when OpenCL Driver changed.
///
/// \param paths MACE will use first file found in all paths
/// \return
GPUContextBuilder &SetOpenCLBinaryPaths(
const std::vector<std::string> &paths);
/// \brief Set the path of Generated OpenCL parameter file
///
/// If you use gpu for specific soc, The parameters is the local work group
/// size tuned for specific SOC, which may be faster than the
/// general parameters.
///
/// \param path Make sure your program have Read/Write permission of the path
/// \return
GPUContextBuilder &SetOpenCLParameterPath(const std::string &path);
std::shared_ptr<GPUContext> Finalize();
private:
class Impl;
std::unique_ptr<Impl> impl_;
};
/// \brief Configuration of a MaceEngine: device, threading and GPU options.
///
/// Not copyable or movable; pass it by const reference to MaceEngine.
class MACE_API MaceEngineConfig {
 public:
  /// \param device_type one of [CPU, GPU, HEXAGON]
  explicit MaceEngineConfig(const DeviceType device_type);
  ~MaceEngineConfig();
  // Non-copyable and non-movable: the config owns a pimpl instance.
  MaceEngineConfig(const MaceEngineConfig &) = delete;
  MaceEngineConfig(const MaceEngineConfig &&) = delete;
  MaceEngineConfig &operator=(const MaceEngineConfig &) = delete;
  MaceEngineConfig &operator=(const MaceEngineConfig &&) = delete;

  /// \brief Set GPUContext
  ///
  /// Just use one GPUContext for multiple models run on GPU.
  /// \param context created using GPUContextBuilder
  /// \return MACE_SUCCESS for success, other for failure.
  MaceStatus SetGPUContext(std::shared_ptr<GPUContext> context);

  /// \brief Set GPU hints, currently only supports Adreno GPU.
  ///
  /// Caution: this function may hurt performance
  /// if improper parameters are provided.
  ///
  /// \param perf_hint performance hint
  /// \param priority_hint priority hint
  /// \return MACE_SUCCESS for success, other for failure.
  MaceStatus SetGPUHints(GPUPerfHint perf_hint,
                         GPUPriorityHint priority_hint);

  /// \brief Set CPU threads number and affinity policy.
  ///
  /// Caution: this function may hurt performance if improper
  /// parameters are provided. When num_threads_hint is zero or negative,
  /// the function will set the threads number equal to the number of
  /// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
  /// (AFFINITY_NONE) cores according to the policy. The threads number will
  /// also be truncated to the corresponding cores number when
  /// num_threads_hint is larger than it.
  /// The OpenMP threads will be bound to (via sched_setaffinity) big cores
  /// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY).
  ///
  /// \param num_threads_hint it is only a hint.
  /// \param policy one of CPUAffinityPolicy
  /// \param use_gemmlowp use gemmlowp for quantized inference
  /// \return MACE_SUCCESS for success, or an error if it can't reliably
  ///         detect big-LITTLE cores (see GetBigLittleCoreIDs). In such
  ///         cases, it's suggested to use AFFINITY_NONE to use all cores.
  MaceStatus SetCPUThreadPolicy(int num_threads_hint,
                                CPUAffinityPolicy policy,
                                bool use_gemmlowp = false);

  /// \brief Set OpenMP threads number and processor affinity.
  ///
  /// Caution: this function may hurt performance
  /// if improper parameters are provided.
  /// This function may not work well on some chips (e.g. MTK). Setting thread
  /// affinity to offline cores may run very slowly or unexpectedly.
  /// In such cases, please use SetCPUThreadPolicy with the default policy
  /// instead.
  ///
  /// \param num_threads number of OpenMP threads
  /// \param cpu_ids ids of the CPU cores the threads are bound to
  /// \return MACE_SUCCESS for success, other for failure.
  MaceStatus SetOpenMPThreadAffinity(
      int num_threads,
      const std::vector<int> &cpu_ids);

  // Accessors used when the engine reads the configuration back.
  DeviceType device_type() const;

  int num_threads() const;

  std::shared_ptr<GPUContext> gpu_context() const;

  GPUPriorityHint gpu_priority_hint() const;

  GPUPerfHint gpu_perf_hint() const;

 private:
  class Impl;  // pimpl: hides implementation details from the public header
  std::unique_ptr<Impl> impl_;
};
// MACE input/output tensor // MACE input/output tensor
class __attribute__((visibility("default"))) MaceTensor { class MACE_API MaceTensor {
public: public:
// shape - the shape of the tensor, with size n // shape - the shape of the tensor, with size n
// data - the buffer of the tensor, must not be null with size equals // data - the buffer of the tensor, must not be null with size equals
// shape[0] * shape[1] * ... * shape[n-1] // shape[0] * shape[1] * ... * shape[n-1]
explicit MaceTensor(const std::vector<int64_t> &shape, MaceTensor(const std::vector<int64_t> &shape,
std::shared_ptr<float> data); std::shared_ptr<float> data);
MaceTensor(); MaceTensor();
MaceTensor(const MaceTensor &other); MaceTensor(const MaceTensor &other);
MaceTensor(const MaceTensor &&other); MaceTensor(const MaceTensor &&other);
...@@ -97,9 +274,9 @@ class __attribute__((visibility("default"))) MaceTensor { ...@@ -97,9 +274,9 @@ class __attribute__((visibility("default"))) MaceTensor {
std::unique_ptr<Impl> impl_; std::unique_ptr<Impl> impl_;
}; };
class __attribute__((visibility("default"))) MaceEngine { class MACE_API MaceEngine {
public: public:
explicit MaceEngine(DeviceType device_type); explicit MaceEngine(const MaceEngineConfig &config);
~MaceEngine(); ~MaceEngine();
MaceStatus Init(const NetDef *net_def, MaceStatus Init(const NetDef *net_def,
...@@ -135,18 +312,16 @@ class __attribute__((visibility("default"))) MaceEngine { ...@@ -135,18 +312,16 @@ class __attribute__((visibility("default"))) MaceEngine {
/// \param model_data_file[in]: the path of model data file /// \param model_data_file[in]: the path of model data file
/// \param input_nodes[in]: the array of input nodes' name /// \param input_nodes[in]: the array of input nodes' name
/// \param output_nodes[in]: the array of output nodes' name /// \param output_nodes[in]: the array of output nodes' name
/// \param device_type[in]: one of [CPU, GPU, HEXAGON], /// \param config[in]: configurations for MaceEngine.
/// based on the runtime type of your model deployment file.
/// \param engine[out]: output MaceEngine object /// \param engine[out]: output MaceEngine object
/// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments, /// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments,
/// MACE_OUT_OF_RESOURCES for resources is out of range. /// MACE_OUT_OF_RESOURCES for resources is out of range.
__attribute__((visibility("default"))) MACE_API MaceStatus CreateMaceEngineFromProto(
MaceStatus CreateMaceEngineFromProto(
const std::vector<unsigned char> &model_pb, const std::vector<unsigned char> &model_pb,
const std::string &model_data_file, const std::string &model_data_file,
const std::vector<std::string> &input_nodes, const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes, const std::vector<std::string> &output_nodes,
const DeviceType device_type, const MaceEngineConfig &config,
std::shared_ptr<MaceEngine> *engine); std::shared_ptr<MaceEngine> *engine);
} // namespace mace } // namespace mace
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file defines runtime tuning APIs.
// These APIs are not stable.
#ifndef MACE_PUBLIC_MACE_RUNTIME_H_
#define MACE_PUBLIC_MACE_RUNTIME_H_
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "mace/public/mace.h"
namespace mace {
enum GPUPerfHint {
PERF_DEFAULT = 0,
PERF_LOW = 1,
PERF_NORMAL = 2,
PERF_HIGH = 3
};
enum GPUPriorityHint {
PRIORITY_DEFAULT = 0,
PRIORITY_LOW = 1,
PRIORITY_NORMAL = 2,
PRIORITY_HIGH = 3
};
enum CPUAffinityPolicy {
AFFINITY_NONE = 0,
AFFINITY_BIG_ONLY = 1,
AFFINITY_LITTLE_ONLY = 2,
};
/// \brief Abstract key-value storage used to persist internal data
/// (e.g. built OpenCL binaries, see SetKVStorageFactory) between runs.
class KVStorage {
 public:
  /// Load the storage contents.
  // return: 0 for success, -1 for error
  virtual int Load() = 0;
  /// Clear the storage contents.
  virtual void Clear() = 0;
  /// Insert a key/value pair.
  // NOTE(review): the meaning of the bool return value is not specified
  // here -- confirm against concrete implementations.
  virtual bool Insert(const std::string &key,
                      const std::vector<unsigned char> &value) = 0;
  /// Look up a value by key; presumably returns nullptr when the key is
  /// absent -- confirm against concrete implementations.
  virtual const std::vector<unsigned char> *Find(const std::string &key) = 0;
  /// Write the in-memory contents back to the backing store.
  // return: 0 for success, -1 for error
  virtual int Flush() = 0;
  virtual ~KVStorage() {}
};
/// \brief Factory interface that creates named KVStorage instances.
class KVStorageFactory {
 public:
  // Virtual destructor: factories are held and destroyed polymorphically
  // (e.g. via std::shared_ptr<KVStorageFactory> in SetKVStorageFactory),
  // so deletion through a base pointer must reach the derived destructor.
  virtual ~KVStorageFactory() {}

  /// \brief Create a storage object identified by name.
  /// \param name identifier of the storage (e.g. a file name)
  /// \return the created storage, owned by the caller
  virtual std::unique_ptr<KVStorage> CreateStorage(const std::string &name) = 0;
};
/// \brief KVStorageFactory that persists each storage as a file under a
/// caller-provided directory path.
class __attribute__((visibility("default"))) FileStorageFactory
    : public KVStorageFactory {
 public:
  /// \param path base directory for the storage files.
  // You have to make sure your APP has read and write permission of the path.
  explicit FileStorageFactory(const std::string &path);
  ~FileStorageFactory();

  /// Create a file-backed storage named `name` under the configured path.
  std::unique_ptr<KVStorage> CreateStorage(const std::string &name) override;

 private:
  class Impl;  // pimpl: hides file-handling details
  std::unique_ptr<Impl> impl_;
};
/// \brief Set internal storage factory to store internal data. (Call once)
///
/// Now the path is used to store the built OpenCL binaries to file,
/// which could speed up the GPU initialization and first run.
/// If you do not call this API, the initialization may be slow for GPU.
///
/// \param path Make sure your program have Read/Write permission of the path
/// \return
__attribute__((visibility("default")))
void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory);
/// \brief Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so) // NOLINT(whitespace/line_length)
///
/// Just call once. (Not thread-safe)
/// if you use gpu of specific soc, Using OpenCL binary will speed up the initialization. // NOLINT(whitespace/line_length)
/// OpenCL binary is corresponding to the OpenCL Driver version,
/// you should update the binary when OpenCL Driver changed.
///
/// \param paths MACE will use first file found in all paths
/// \return
__attribute__((visibility("default")))
void SetOpenCLBinaryPaths(const std::vector<std::string> &paths);
/// \brief Set the path of Generated OpenCL parameter file
///
/// Just call once. (Not thread-safe)
/// If you use gpu for specific soc, The parameters is the local work group
/// size tuned for specific SOC, which may be faster than the
/// general parameters.
///
/// \param path Make sure your program have Read/Write permission of the path
/// \return
__attribute__((visibility("default")))
void SetOpenCLParameterPath(const std::string &path);
/// \brief Set GPU hints, currently only supports Adreno GPU.
///
/// Caution: this function may hurt performance
/// if improper parameters provided.
///
/// \param perf_hint performance hint
/// \param priority_hint priority hint
/// \return
__attribute__((visibility("default")))
void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);
/// \brief Set OpenMP threads number and affinity policy.
///
/// Caution: this function may hurt performance if improper parameters provided.
/// When num_threads_hint is zero or negative,
/// the function will set the threads number equaling to the number of
/// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
/// (AFFINITY_NONE) cores according to the policy. The threads number will
/// also be truncated to the corresponding cores number when num_threads_hint
/// is larger than it.
/// The OpenMP threads will be bound to (via sched_setaffinity) big cores
/// (AFFINITY_BIG_ONLY) or little cores (AFFINITY_LITTLE_ONLY).
/// If use_gemmlowp is set to be true, then gemmlowp threads would be set for
/// quantized inference.
///
/// \param num_threads_hint it is only a hint.
/// \param policy one of CPUAffinityPolicy
/// \param use_gemmlowp use gemmlowp for quantized inference
/// \return MACE_SUCCESS for success, or it can't reliably detect big-LITTLE
/// cores (see GetBigLittleCoreIDs). In such cases, it's suggested to use
/// AFFINITY_NONE to use all cores.
__attribute__((visibility("default")))
MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
CPUAffinityPolicy policy,
bool use_gemmlowp = false);
/// \brief Set OpenMP threads number and processor affinity.
///
/// Caution: this function may hurt performance
/// if improper parameters provided.
/// This function may not work well on some chips (e.g. MTK). Setting thread
/// affinity to offline cores may run very slow or unexpectedly.
/// In such cases, please use SetOpenMPThreadPolicy with default policy
/// instead.
///
/// \param num_threads
/// \param cpu_ids
/// \return
__attribute__((visibility("default")))
MaceStatus SetOpenMPThreadAffinity(int num_threads,
const std::vector<int> &cpu_ids);
/// \brief Get ARM big.LITTLE configuration.
///
/// This function will detect the max frequencies of all CPU cores, and assume
/// the cores with largest max frequencies as big cores, and all the remaining
/// cores as little. If all CPU cores' max frequencies are equal, big_core_ids and
/// little_core_ids will both be filled with all cpu core ids.
///
/// \param [out] big_core_ids
/// \param [out] little_core_ids
/// \return If successful, it returns MACE_SUCCESS and error if it can't
/// reliably detect the frequency of big-LITTLE cores (e.g. MTK).
__attribute__((visibility("default")))
MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
std::vector<int> *little_core_ids);
} // namespace mace
#endif // MACE_PUBLIC_MACE_RUNTIME_H_
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
#include <vector> #include <vector>
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
namespace mace { namespace mace {
...@@ -57,8 +56,7 @@ std::map<std::string, int> model_name_map { ...@@ -57,8 +56,7 @@ std::map<std::string, int> model_name_map {
/// if model_data_format is code, just pass empty string("") /// if model_data_format is code, just pass empty string("")
/// \param input_nodes[in]: the array of input nodes' name /// \param input_nodes[in]: the array of input nodes' name
/// \param output_nodes[in]: the array of output nodes' name /// \param output_nodes[in]: the array of output nodes' name
/// \param device_type[in]: one of [CPU, GPU, HEXAGON], /// \param config[in]: configurations for MaceEngine.
/// based on the runtime type of your model deployment file.
/// \param engine[out]: output MaceEngine object /// \param engine[out]: output MaceEngine object
/// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments, /// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments,
/// MACE_OUT_OF_RESOURCES for resources is out of range. /// MACE_OUT_OF_RESOURCES for resources is out of range.
...@@ -67,7 +65,7 @@ MaceStatus CreateMaceEngineFromCode( ...@@ -67,7 +65,7 @@ MaceStatus CreateMaceEngineFromCode(
const std::string &model_data_file, const std::string &model_data_file,
const std::vector<std::string> &input_nodes, const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes, const std::vector<std::string> &output_nodes,
const DeviceType device_type, const MaceEngineConfig &config,
std::shared_ptr<MaceEngine> *engine) { std::shared_ptr<MaceEngine> *engine) {
// load model // load model
if (engine == nullptr) { if (engine == nullptr) {
...@@ -83,7 +81,7 @@ MaceStatus CreateMaceEngineFromCode( ...@@ -83,7 +81,7 @@ MaceStatus CreateMaceEngineFromCode(
{% for i in range(model_tags |length) %} {% for i in range(model_tags |length) %}
case {{ i }}: case {{ i }}:
net_def = mace::{{model_tags[i]}}::CreateNet(); net_def = mace::{{model_tags[i]}}::CreateNet();
engine->reset(new mace::MaceEngine(device_type)); engine->reset(new mace::MaceEngine(config));
{% if embed_model_data %} {% if embed_model_data %}
model_data = mace::{{model_tags[i]}}::LoadModelData(); model_data = mace::{{model_tags[i]}}::LoadModelData();
status = (*engine)->Init(net_def.get(), input_nodes, output_nodes, status = (*engine)->Init(net_def.get(), input_nodes, output_nodes,
......
# Description:
# Mace operators.
#
package( package(
default_visibility = ["//visibility:public"], default_visibility = ["//visibility:public"],
) )
......
...@@ -23,7 +23,9 @@ TEST(MaceAPIExceptionTest, WrongInputTest) { ...@@ -23,7 +23,9 @@ TEST(MaceAPIExceptionTest, WrongInputTest) {
input_names.push_back(MakeString("input", 0)); input_names.push_back(MakeString("input", 0));
output_names.push_back(MakeString("output", 0)); output_names.push_back(MakeString("output", 0));
const DeviceType device = DeviceType::GPU; MaceEngineConfig config(DeviceType::GPU);
config.SetGPUContext(
ops::test::OpTestContext::Get()->gpu_context());
std::shared_ptr<NetDef> net_def(new NetDef()); std::shared_ptr<NetDef> net_def(new NetDef());
for (size_t i = 0; i < input_names.size(); ++i) { for (size_t i = 0; i < input_names.size(); ++i) {
...@@ -31,7 +33,7 @@ TEST(MaceAPIExceptionTest, WrongInputTest) { ...@@ -31,7 +33,7 @@ TEST(MaceAPIExceptionTest, WrongInputTest) {
info->set_name(input_names[i]); info->set_name(input_names[i]);
} }
MaceEngine engine(device); MaceEngine engine(config);
ASSERT_DEATH(engine.Init(net_def.get(), {"input"}, output_names, nullptr), ASSERT_DEATH(engine.Init(net_def.get(), {"input"}, output_names, nullptr),
""); "");
} }
......
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
#include "mace/public/mace_runtime.h"
namespace mace { namespace mace {
namespace test { namespace test {
...@@ -200,7 +199,7 @@ void CheckOutputs(const NetDef &net_def, ...@@ -200,7 +199,7 @@ void CheckOutputs(const NetDef &net_def,
for (auto output : outputs) { for (auto output : outputs) {
std::unique_ptr<Tensor> tmp_tensor( std::unique_ptr<Tensor> tmp_tensor(
new Tensor(GetDeviceAllocator(DeviceType::CPU), new Tensor(GetCPUAllocator(),
DataTypeToEnum<float>::v())); DataTypeToEnum<float>::v()));
auto output_shape = output.second.shape(); auto output_shape = output.second.shape();
const int64_t data_size = std::accumulate(output_shape.begin(), const int64_t data_size = std::accumulate(output_shape.begin(),
...@@ -333,13 +332,9 @@ void MaceRunFunc(const int in_out_size) { ...@@ -333,13 +332,9 @@ void MaceRunFunc(const int in_out_size) {
OutputInfo *info = net_def->add_output_info(); OutputInfo *info = net_def->add_output_info();
info->set_name(output_names[i]); info->set_name(output_names[i]);
} }
MaceEngineConfig config(DeviceType::GPU);
const std::string file_path ="/data/local/tmp/mace"; MaceEngine engine(config);
std::shared_ptr<KVStorageFactory> storage_factory(
new FileStorageFactory(file_path));
mace::SetKVStorageFactory(storage_factory);
MaceEngine engine(device);
MaceStatus status = engine.Init(net_def.get(), input_names, output_names, MaceStatus status = engine.Init(net_def.get(), input_names, output_names,
reinterpret_cast<unsigned char *>(data.data())); reinterpret_cast<unsigned char *>(data.data()));
EXPECT_EQ(status, MaceStatus::MACE_SUCCESS); EXPECT_EQ(status, MaceStatus::MACE_SUCCESS);
...@@ -367,7 +362,7 @@ TEST_F(MaceMTAPITest, MultipleThread) { ...@@ -367,7 +362,7 @@ TEST_F(MaceMTAPITest, MultipleThread) {
const int thread_num = 10; const int thread_num = 10;
std::vector<std::thread> threads; std::vector<std::thread> threads;
for (int i = 0; i < thread_num; ++i) { for (int i = 0; i < thread_num; ++i) {
threads.push_back(std::thread(MaceRunFunc, i)); threads.push_back(std::thread(MaceRunFunc, 1));
} }
for (auto &t : threads) { for (auto &t : threads) {
t.join(); t.join();
......
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
#include "mace/public/mace_runtime.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace test { namespace test {
...@@ -199,9 +199,10 @@ void CheckOutputs(const NetDef &net_def, ...@@ -199,9 +199,10 @@ void CheckOutputs(const NetDef &net_def,
} }
net.RunNet(net_def, D); net.RunNet(net_def, D);
std::unique_ptr<Allocator> allocator(new CPUAllocator);
for (auto output : outputs) { for (auto output : outputs) {
std::unique_ptr<Tensor> tmp_tensor( std::unique_ptr<Tensor> tmp_tensor(
new Tensor(GetDeviceAllocator(DeviceType::CPU), new Tensor(allocator.get(),
DataTypeToEnum<float>::v())); DataTypeToEnum<float>::v()));
auto output_shape = output.second.shape(); auto output_shape = output.second.shape();
const int64_t data_size = std::accumulate(output_shape.begin(), const int64_t data_size = std::accumulate(output_shape.begin(),
...@@ -333,7 +334,9 @@ void MaceRun(const int in_out_size, ...@@ -333,7 +334,9 @@ void MaceRun(const int in_out_size,
info->set_name(output_names[i]); info->set_name(output_names[i]);
} }
MaceEngine engine(device); MaceEngineConfig config(DeviceType::GPU);
MaceEngine engine(config);
MaceStatus status = engine.Init(net_def.get(), input_names, output_names, MaceStatus status = engine.Init(net_def.get(), input_names, output_names,
reinterpret_cast<unsigned char *>(data.data())); reinterpret_cast<unsigned char *>(data.data()));
EXPECT_EQ(status, MaceStatus::MACE_SUCCESS); EXPECT_EQ(status, MaceStatus::MACE_SUCCESS);
......
...@@ -33,7 +33,6 @@ ...@@ -33,7 +33,6 @@
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/env_time.h" #include "mace/utils/env_time.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
...@@ -122,8 +121,15 @@ bool RunModel(const std::string &model_name, ...@@ -122,8 +121,15 @@ bool RunModel(const std::string &model_name,
const std::vector<std::vector<int64_t>> &input_shapes, const std::vector<std::vector<int64_t>> &input_shapes,
const std::vector<std::string> &output_names, const std::vector<std::string> &output_names,
const std::vector<std::vector<int64_t>> &output_shapes) { const std::vector<std::vector<int64_t>> &output_shapes) {
MACE_RETURN_IF_ERROR(mace::SetOpenMPThreadPolicy( // config runtime
FLAGS_omp_num_threads, CPUAffinityPolicy::AFFINITY_NONE)); MaceStatus status;
MaceEngineConfig config(DeviceType::CPU);
status = config.SetCPUThreadPolicy(
FLAGS_omp_num_threads,
CPUAffinityPolicy::AFFINITY_NONE);
if (status != MACE_SUCCESS) {
LOG(WARNING) << "Set openmp or cpu affinity failed.";
}
std::vector<unsigned char> model_pb_data; std::vector<unsigned char> model_pb_data;
if (FLAGS_model_file != "") { if (FLAGS_model_file != "") {
...@@ -141,7 +147,7 @@ bool RunModel(const std::string &model_name, ...@@ -141,7 +147,7 @@ bool RunModel(const std::string &model_name,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
DeviceType::CPU, config,
&engine)); &engine));
#else #else
(void) (model_name); (void) (model_name);
...@@ -150,7 +156,7 @@ bool RunModel(const std::string &model_name, ...@@ -150,7 +156,7 @@ bool RunModel(const std::string &model_name,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
DeviceType::CPU, config,
&engine)); &engine));
#endif #endif
......
...@@ -33,7 +33,6 @@ ...@@ -33,7 +33,6 @@
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/env_time.h" #include "mace/utils/env_time.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
...@@ -203,35 +202,37 @@ bool RunModel(const std::string &model_name, ...@@ -203,35 +202,37 @@ bool RunModel(const std::string &model_name,
const std::vector<std::vector<int64_t>> &output_shapes) { const std::vector<std::vector<int64_t>> &output_shapes) {
DeviceType device_type = ParseDeviceType(FLAGS_device); DeviceType device_type = ParseDeviceType(FLAGS_device);
// config runtime // config runtime
MaceStatus status = mace::SetOpenMPThreadPolicy( MaceStatus status;
FLAGS_omp_num_threads, MaceEngineConfig config(device_type);
static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy), status = config.SetCPUThreadPolicy(
true); FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy),
true);
if (status != MACE_SUCCESS) { if (status != MACE_SUCCESS) {
LOG(WARNING) << "Set openmp or cpu affinity failed."; LOG(WARNING) << "Set openmp or cpu affinity failed.";
} }
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
std::shared_ptr<GPUContext> gpu_context;
if (device_type == DeviceType::GPU) { if (device_type == DeviceType::GPU) {
mace::SetGPUHints( const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint), const std::string storage_path =
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint)); std::string(storage_path_ptr == nullptr ?
"/data/local/tmp/mace_run/interior" : storage_path_ptr);
std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file}; std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
mace::SetOpenCLBinaryPaths(opencl_binary_paths);
mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file); gpu_context = GPUContextBuilder()
.SetStoragePath(storage_path)
.SetOpenCLBinaryPaths(opencl_binary_paths)
.SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
.Finalize();
config.SetGPUContext(gpu_context);
config.SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
} }
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
const char *kernel_path = getenv("MACE_INTERNAL_STORAGE_PATH");
const std::string kernel_file_path =
std::string(kernel_path == nullptr ?
"/data/local/tmp/mace_run/interior" : kernel_path);
std::shared_ptr<KVStorageFactory> storage_factory(
new FileStorageFactory(kernel_file_path));
SetKVStorageFactory(storage_factory);
std::vector<unsigned char> model_pb_data; std::vector<unsigned char> model_pb_data;
if (FLAGS_model_file != "") { if (FLAGS_model_file != "") {
if (!mace::ReadBinaryFile(&model_pb_data, FLAGS_model_file)) { if (!mace::ReadBinaryFile(&model_pb_data, FLAGS_model_file)) {
...@@ -252,7 +253,7 @@ bool RunModel(const std::string &model_name, ...@@ -252,7 +253,7 @@ bool RunModel(const std::string &model_name,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#else #else
(void)(model_name); (void)(model_name);
...@@ -261,7 +262,7 @@ bool RunModel(const std::string &model_name, ...@@ -261,7 +262,7 @@ bool RunModel(const std::string &model_name,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#endif #endif
int64_t t1 = NowMicros(); int64_t t1 = NowMicros();
...@@ -326,7 +327,7 @@ bool RunModel(const std::string &model_name, ...@@ -326,7 +327,7 @@ bool RunModel(const std::string &model_name,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#else #else
create_engine_status = create_engine_status =
...@@ -334,7 +335,7 @@ bool RunModel(const std::string &model_name, ...@@ -334,7 +335,7 @@ bool RunModel(const std::string &model_name,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#endif #endif
} while (create_engine_status != MACE_SUCCESS); } while (create_engine_status != MACE_SUCCESS);
...@@ -366,7 +367,7 @@ bool RunModel(const std::string &model_name, ...@@ -366,7 +367,7 @@ bool RunModel(const std::string &model_name,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#else #else
create_engine_status = create_engine_status =
...@@ -374,7 +375,7 @@ bool RunModel(const std::string &model_name, ...@@ -374,7 +375,7 @@ bool RunModel(const std::string &model_name,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#endif #endif
} while (create_engine_status != MACE_SUCCESS); } while (create_engine_status != MACE_SUCCESS);
......
...@@ -15,6 +15,8 @@ ...@@ -15,6 +15,8 @@
#ifndef MACE_UTILS_TUNER_H_ #ifndef MACE_UTILS_TUNER_H_
#define MACE_UTILS_TUNER_H_ #define MACE_UTILS_TUNER_H_
#include <stdlib.h> #include <stdlib.h>
#include <cstring>
#include <fstream> #include <fstream>
#include <functional> #include <functional>
#include <limits> #include <limits>
...@@ -29,18 +31,24 @@ ...@@ -29,18 +31,24 @@
namespace mace { namespace mace {
inline bool IsTuning() {
const char *tuning = getenv("MACE_TUNING");
return tuning != nullptr && strlen(tuning) == 1 && tuning[0] == '1';
}
template <typename param_type> template <typename param_type>
class Tuner { class Tuner {
public: public:
static Tuner *Get() { explicit Tuner(const std::string tuned_param_file_path = ""):
static Tuner tuner; tuned_param_file_path_(tuned_param_file_path) {
return &tuner; path_ = getenv("MACE_RUN_PARAMETER_PATH");
ReadRunParamters();
} }
inline bool IsTuning() { ~Tuner() { WriteRunParameters(); }
const char *tuning = getenv("MACE_TUNING");
return tuning != nullptr && strlen(tuning) == 1 && tuning[0] == '1'; Tuner(const Tuner &) = delete;
} Tuner &operator=(const Tuner &) = delete;
template <typename RetType> template <typename RetType>
RetType TuneOrRun( RetType TuneOrRun(
...@@ -76,16 +84,6 @@ class Tuner { ...@@ -76,16 +84,6 @@ class Tuner {
} }
private: private:
Tuner() {
path_ = getenv("MACE_RUN_PARAMETER_PATH");
ReadRunParamters();
}
~Tuner() { WriteRunParameters(); }
Tuner(const Tuner &) = delete;
Tuner &operator=(const Tuner &) = delete;
inline void WriteRunParameters() { inline void WriteRunParameters() {
if (path_ != nullptr) { if (path_ != nullptr) {
VLOG(3) << "Write tuning result to " << path_; VLOG(3) << "Write tuning result to " << path_;
...@@ -117,9 +115,9 @@ class Tuner { ...@@ -117,9 +115,9 @@ class Tuner {
} }
inline void ReadRunParamters() { inline void ReadRunParamters() {
extern std::string kOpenCLParameterPath; if (!tuned_param_file_path_.empty()) {
if (!kOpenCLParameterPath.empty()) { std::ifstream ifs(tuned_param_file_path_,
std::ifstream ifs(kOpenCLParameterPath, std::ios::binary | std::ios::in); std::ios::binary | std::ios::in);
if (ifs.is_open()) { if (ifs.is_open()) {
int64_t num_params = 0; int64_t num_params = 0;
ifs.read(reinterpret_cast<char *>(&num_params), sizeof(num_params)); ifs.read(reinterpret_cast<char *>(&num_params), sizeof(num_params));
...@@ -144,7 +142,7 @@ class Tuner { ...@@ -144,7 +142,7 @@ class Tuner {
LOG(WARNING) << "Read OpenCL tuned parameters file failed."; LOG(WARNING) << "Read OpenCL tuned parameters file failed.";
} }
} else { } else {
LOG(INFO) << "There is no tuned parameters."; VLOG(1) << "There is no tuned parameters.";
} }
} }
...@@ -207,6 +205,7 @@ class Tuner { ...@@ -207,6 +205,7 @@ class Tuner {
} }
private: private:
std::string tuned_param_file_path_;
const char *path_; const char *path_;
std::unordered_map<std::string, std::vector<param_type>> param_table_; std::unordered_map<std::string, std::vector<param_type>> param_table_;
}; };
......
...@@ -42,15 +42,16 @@ TEST_F(TunerTest, SimpleRun) { ...@@ -42,15 +42,16 @@ TEST_F(TunerTest, SimpleRun) {
} }
}; };
Tuner<unsigned int> tuner;
WallClockTimer timer; WallClockTimer timer;
std::vector<unsigned int> default_params(1, 1); std::vector<unsigned int> default_params(1, 1);
int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>( int res = tuner.TuneOrRun<unsigned int>(
"SimpleRun", default_params, nullptr, TunerFunc, &timer); "SimpleRun", default_params, nullptr, TunerFunc, &timer);
EXPECT_EQ(expect, res); EXPECT_EQ(expect, res);
default_params[0] = 2; default_params[0] = 2;
res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>( res = tuner.TuneOrRun<unsigned int>(
"SimpleRun", default_params, nullptr, TunerFunc, &timer); "SimpleRun", default_params, nullptr, TunerFunc, &timer);
EXPECT_EQ(expect + 1, res); EXPECT_EQ(expect + 1, res);
} }
...@@ -88,13 +89,14 @@ TEST_F(TunerTest, SimpleTune) { ...@@ -88,13 +89,14 @@ TEST_F(TunerTest, SimpleTune) {
return {{1}, {2}, {3}, {4}}; return {{1}, {2}, {3}, {4}};
}; };
// tune // tune
Tuner<unsigned int> tuner;
WallClockTimer timer; WallClockTimer timer;
int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>( int res = tuner.TuneOrRun<unsigned int>(
"SimpleRun", default_params, *params_generator, TunerFunc, &timer); "SimpleRun", default_params, *params_generator, TunerFunc, &timer);
EXPECT_EQ(expect, res); EXPECT_EQ(expect, res);
// run // run
res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>( res = tuner.template TuneOrRun<unsigned int>(
"SimpleRun", default_params, nullptr, TunerFunc, &timer); "SimpleRun", default_params, nullptr, TunerFunc, &timer);
EXPECT_EQ(expect, res); EXPECT_EQ(expect, res);
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册