Refactor configuration APIs and Remove some global static variables.

f23eb74b · liuqi · 379c730d · f23eb74b · f23eb74b · f23eb74b
205 changed file
--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -22,7 +22,6 @@

 #include "gflags/gflags.h"
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/logging.h"
 #include "mace/utils/utils.h"
 #include "mace/benchmark/statistics.h"
@@ -257,36 +256,40 @@ int Main(int argc, char **argv) {

  mace::DeviceType device_type = ParseDeviceType(FLAGS_device);

-  // config runtime
-  MaceStatus ret = mace::SetOpenMPThreadPolicy(
+  // configuration
+  MaceStatus mace_status;
+  MaceEngineConfig config(device_type);
+  mace_status = config.SetCPUThreadPolicy(
      FLAGS_omp_num_threads,
-      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
+      static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy),
      true);
-  if (ret != MACE_SUCCESS) {
-    LOG(WARNING) << "Set openmp or cpu affinity failed.";
+  if (mace_status != MACE_SUCCESS) {
+    LOG(INFO) << "Set openmp or cpu affinity failed.";
  }
 #ifdef MACE_ENABLE_OPENCL
+  std::shared_ptr<GPUContext> gpu_context;
  if (device_type == DeviceType::GPU) {
-    mace::SetGPUHints(
-        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
-        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
-
+    // DO NOT USE tmp directory.
+    // Please use APP's own directory and make sure the directory exists.
+    const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
+    const std::string storage_path =
+        std::string(storage_path_ptr == nullptr ?
+                    "/data/local/tmp/mace_run/interior" : storage_path_ptr);
    std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
-    mace::SetOpenCLBinaryPaths(opencl_binary_paths);

-    mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file);
+    gpu_context = GPUContextBuilder()
+        .SetStoragePath(storage_path)
+        .SetOpenCLBinaryPaths(opencl_binary_paths)
+        .SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
+        .Finalize();
+
+    config.SetGPUContext(gpu_context);
+    config.SetGPUHints(
+        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
+        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
  }
 #endif  // MACE_ENABLE_OPENCL

-  const char *kernel_path = getenv("MACE_INTERNAL_STORAGE_PATH");
-  const std::string kernel_file_path =
-      std::string(kernel_path == nullptr ?
-                  "/data/local/tmp/mace_run/interior" : kernel_path);
-
-  std::shared_ptr<KVStorageFactory> storage_factory(
-      new FileStorageFactory(kernel_file_path));
-  SetKVStorageFactory(storage_factory);
-
  // Create Engine
  std::shared_ptr<mace::MaceEngine> engine;
  MaceStatus create_engine_status;
@@ -306,7 +309,7 @@ int Main(int argc, char **argv) {
                                 model_data_file_ptr,
                                 input_names,
                                 output_names,
-                                 device_type,
+                                 config,
                                 &engine);
 #else
  create_engine_status =
@@ -314,7 +317,7 @@ int Main(int argc, char **argv) {
                                model_data_file_ptr,
                                input_names,
                                output_names,
-                                device_type,
+                                config,
                                &engine);
 #endif
  if (create_engine_status != MaceStatus::MACE_SUCCESS) {

--- a/mace/core/allocator.cc
+++ b/mace/core/allocator.cc
@@ -13,30 +13,12 @@
 // limitations under the License.

 #include "mace/core/allocator.h"
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/opencl_allocator.h"
-#endif

 namespace mace {

-std::map<int32_t, Allocator *> *gAllocatorRegistry() {
-  static std::map<int32_t, Allocator *> g_allocator_registry;
-  return &g_allocator_registry;
+Allocator *GetCPUAllocator() {
+  static CPUAllocator allocator;
+  return &allocator;
 }

-Allocator *GetDeviceAllocator(DeviceType type) {
-  auto iter = gAllocatorRegistry()->find(type);
-  if (iter == gAllocatorRegistry()->end()) {
-    LOG(ERROR) << "Allocator not found for device " << type;
-    return nullptr;
-  }
-  return iter->second;
-}
-
-MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator());
-#ifdef MACE_ENABLE_OPENCL
-MACE_REGISTER_ALLOCATOR(DeviceType::GPU, new OpenCLAllocator());
-#endif
-MACE_REGISTER_ALLOCATOR(DeviceType::HEXAGON, new CPUAllocator());
-
 }  // namespace mace
--- a/mace/core/allocator.h
+++ b/mace/core/allocator.h
@@ -26,8 +26,6 @@
 #include "mace/core/registry.h"
 #include "mace/core/types.h"
 #include "mace/core/runtime_failure_mock.h"
-#include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"

 namespace mace {

@@ -138,26 +136,8 @@ class CPUAllocator : public Allocator {
  bool OnHost() const override { return true; }
 };

-std::map<int32_t, Allocator *> *gAllocatorRegistry();
-
-Allocator *GetDeviceAllocator(DeviceType type);
-
-struct AllocatorRegisterer {
-  explicit AllocatorRegisterer(DeviceType type, Allocator *alloc) {
-    if (gAllocatorRegistry()->count(type)) {
-      LOG(ERROR) << "Allocator for device type " << type
-                 << " registered twice. This should not happen."
-                 << gAllocatorRegistry()->count(type);
-      std::exit(1);
-    }
-    gAllocatorRegistry()->emplace(type, alloc);
-  }
-};
-
-#define MACE_REGISTER_ALLOCATOR(type, alloc)                                  \
-  namespace {                                                                 \
-  static AllocatorRegisterer MACE_ANONYMOUS_VARIABLE(Allocator)(type, alloc); \
-  }
+// Global CPU allocator used for CPU/GPU/DSP
+Allocator *GetCPUAllocator();

 }  // namespace mace


--- a/mace/core/arg_helper.h
+++ b/mace/core/arg_helper.h
@@ -20,7 +20,6 @@
 #include <vector>

 #include "mace/proto/mace.pb.h"
-#include "mace/public/mace.h"

 namespace mace {


--- a/mace/core/buffer.h
+++ b/mace/core/buffer.h
@@ -218,9 +218,9 @@ class Buffer : public BufferBase {

 class Image : public BufferBase {
 public:
-  Image()
+  explicit Image(Allocator *allocator)
      : BufferBase(0),
-        allocator_(GetDeviceAllocator(GPU)),
+        allocator_(allocator),
        buf_(nullptr),
        mapped_buf_(nullptr) {}


--- a/mace/core/device.cc
+++ b/mace/core/device.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/device.h"
+
+namespace mace {
+
+CPUDevice::CPUDevice(const int num_threads)
+    : cpu_runtime_(new CPURuntime(num_threads)) {}
+
+CPUDevice::~CPUDevice() = default;
+
+CPURuntime *CPUDevice::cpu_runtime() {
+  return cpu_runtime_.get();
+}
+
+#ifdef MACE_ENABLE_OPENCL
+OpenCLRuntime *CPUDevice::opencl_runtime() {
+  return nullptr;
+}
+#endif
+
+Allocator *CPUDevice::allocator() {
+  return GetCPUAllocator();
+}
+
+DeviceType CPUDevice::device_type() const {
+  return DeviceType::CPU;
+}
+
+}  // namespace mace
--- a/mace/core/device.h
+++ b/mace/core/device.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_DEVICE_H_
+#define MACE_CORE_DEVICE_H_
+
+#include <memory>
+
+#include "mace/core/runtime/cpu/cpu_runtime.h"
+#include "mace/core/allocator.h"
+
+#ifdef MACE_ENABLE_OPENCL
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+#endif
+
+namespace mace {
+
+class Device {
+ public:
+  virtual ~Device() {}
+
+#ifdef MACE_ENABLE_OPENCL
+  virtual OpenCLRuntime *opencl_runtime() = 0;
+#endif
+  virtual CPURuntime *cpu_runtime() = 0;
+
+  virtual Allocator *allocator() = 0;
+  virtual DeviceType device_type() const = 0;
+};
+
+class CPUDevice : public Device {
+ public:
+  explicit CPUDevice(const int num_threads);
+  virtual ~CPUDevice();
+
+#ifdef MACE_ENABLE_OPENCL
+  OpenCLRuntime *opencl_runtime() override;
+#endif
+  CPURuntime *cpu_runtime() override;
+
+  Allocator *allocator() override;
+  DeviceType device_type() const override;
+
+ private:
+  std::unique_ptr<CPURuntime> cpu_runtime_;
+};
+
+}  // namespace mace
+#endif  // MACE_CORE_DEVICE_H_
--- a/mace/core/device_context.cc
+++ b/mace/core/device_context.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/device_context.h"
+
+#include <sys/stat.h>
+
+namespace mace {
+
+namespace {
+
+const char *kPrecompiledProgramFileName = "mace_cl_compiled_program.bin";
+
+std::string FindFirstExistPath(const std::vector<std::string> &paths) {
+  std::string result;
+  struct stat st;
+  for (auto path : paths) {
+    if (stat(path.c_str(), &st) == 0) {
+      if (S_ISREG(st.st_mode)) {
+        result = path;
+        break;
+      }
+    }
+  }
+  return result;
+}
+}  // namespace
+
+GPUContext::GPUContext(const std::string &storage_path,
+                       const std::vector<std::string> &opencl_binary_paths,
+                       const std::string &opencl_parameter_path)
+    : storage_factory_(new FileStorageFactory(storage_path)),
+      opencl_tuner_(new Tuner<uint32_t>(opencl_parameter_path)) {
+
+  if (!storage_path.empty()) {
+    opencl_cache_storage_ =
+        storage_factory_->CreateStorage(kPrecompiledProgramFileName);
+  }
+
+  std::string precompiled_binary_path =
+      FindFirstExistPath(opencl_binary_paths);
+  if (!precompiled_binary_path.empty()) {
+    opencl_binary_storage_.reset(
+        new FileStorage(precompiled_binary_path));
+  }
+}
+
+GPUContext::~GPUContext() = default;
+
+KVStorage *GPUContext::opencl_binary_storage() {
+  return opencl_binary_storage_.get();
+}
+
+KVStorage *GPUContext::opencl_cache_storage() {
+  return opencl_cache_storage_.get();
+}
+
+Tuner<uint32_t> *GPUContext::opencl_tuner() {
+  return opencl_tuner_.get();
+}
+
+}  // namespace mace
--- a/mace/core/device_context.h
+++ b/mace/core/device_context.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_DEVICE_CONTEXT_H_
+#define MACE_CORE_DEVICE_CONTEXT_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "mace/core/file_storage.h"
+#include "mace/utils/tuner.h"
+
+namespace mace {
+
+class GPUContext {
+ public:
+  GPUContext(const std::string &storage_path = "",
+             const std::vector<std::string> &opencl_binary_path = {},
+             const std::string &opencl_parameter_path = "");
+  ~GPUContext();
+
+  KVStorage *opencl_binary_storage();
+  KVStorage *opencl_cache_storage();
+  Tuner<uint32_t> *opencl_tuner();
+
+ private:
+  std::unique_ptr<KVStorageFactory> storage_factory_;
+  std::unique_ptr<Tuner<uint32_t>> opencl_tuner_;
+  std::unique_ptr<KVStorage> opencl_binary_storage_;
+  std::unique_ptr<KVStorage> opencl_cache_storage_;
+};
+
+}  // namespace mace
+#endif  // MACE_CORE_DEVICE_CONTEXT_H_
--- a/mace/core/file_storage.cc
+++ b/mace/core/file_storage.cc
@@ -28,10 +28,36 @@

 namespace mace {

-std::shared_ptr<KVStorageFactory> kStorageFactory = nullptr;
+class FileStorageFactory::Impl {
+ public:
+  explicit Impl(const std::string &path);
+
+  std::unique_ptr<KVStorage> CreateStorage(const std::string &name);
+
+ private:
+  std::string path_;
+};
+
+FileStorageFactory::Impl::Impl(const std::string &path): path_(path) {}
+
+std::unique_ptr<KVStorage> FileStorageFactory::Impl::CreateStorage(
+    const std::string &name) {
+  return std::move(std::unique_ptr<KVStorage>(
+      new FileStorage(path_ + "/" + name)));
+}
+
+FileStorageFactory::FileStorageFactory(const std::string &path):
+    impl_(new FileStorageFactory::Impl(path)) {}
+
+FileStorageFactory::~FileStorageFactory() = default;
+
+std::unique_ptr<KVStorage> FileStorageFactory::CreateStorage(
+    const std::string &name) {
+  return impl_->CreateStorage(name);
+}

 FileStorage::FileStorage(const std::string &file_path):
-    data_changed_(false), file_path_(file_path) {}
+    loaded_(false), data_changed_(false), file_path_(file_path) {}

 int FileStorage::Load() {
  struct stat st;
@@ -47,6 +73,9 @@ int FileStorage::Load() {
    }
  }
  utils::WriteLock lock(&data_mutex_);
+  if (loaded_) {
+    return 0;
+  }
  int fd = open(file_path_.c_str(), O_RDONLY);
  if (fd < 0) {
    if (errno == ENOENT) {
@@ -118,13 +147,17 @@ int FileStorage::Load() {
                 << " failed, error code: " << strerror(errno);
    return -1;
  }
+  loaded_ = true;
  return 0;
 }

-void FileStorage::Clear() {
+bool FileStorage::Clear() {
  utils::WriteLock lock(&data_mutex_);
-  data_.clear();
-  data_changed_ = true;
+  if (!data_.empty()) {
+    data_.clear();
+    data_changed_ = true;
+  }
+  return true;
 }

 bool FileStorage::Insert(const std::string &key,

--- a/mace/core/file_storage.h
+++ b/mace/core/file_storage.h
@@ -16,27 +16,64 @@
 #define MACE_CORE_FILE_STORAGE_H_

 #include <map>
+#include <memory>
 #include <string>
 #include <vector>

-#include "mace/public/mace_runtime.h"
+#include "mace/public/mace.h"
 #include "mace/utils/rwlock.h"

 namespace mace {

+class KVStorage {
+ public:
+  // return: 0 for success, -1 for error
+  virtual int Load() = 0;
+  virtual bool Clear() = 0;
+  // insert or update the key-value.
+  virtual bool Insert(const std::string &key,
+                      const std::vector<unsigned char> &value) = 0;
+  virtual const std::vector<unsigned char> *Find(const std::string &key) = 0;
+  // return: 0 for success, -1 for error
+  virtual int Flush() = 0;
+  virtual ~KVStorage() {}
+};
+
+class KVStorageFactory {
+ public:
+  virtual std::unique_ptr<KVStorage> CreateStorage(const std::string &name) = 0;
+
+  virtual ~KVStorageFactory() {}
+};
+
+class FileStorageFactory : public KVStorageFactory {
+ public:
+  // You have to make sure your APP have read and write permission of the path.
+  explicit FileStorageFactory(const std::string &path);
+
+  ~FileStorageFactory();
+
+  std::unique_ptr<KVStorage> CreateStorage(const std::string &name) override;
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
 class FileStorage : public KVStorage {
 public:
  explicit FileStorage(const std::string &file_path);

 public:
  int Load() override;
-  void Clear() override;
+  bool Clear() override;
  bool Insert(const std::string &key,
              const std::vector<unsigned char> &value) override;
  const std::vector<unsigned char> *Find(const std::string &key) override;
  int Flush() override;

 private:
+  bool loaded_;
  bool data_changed_;
  std::string file_path_;
  std::map<std::string, std::vector<unsigned char>> data_;

--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -18,6 +18,7 @@

 #include "mace/core/macros.h"
 #include "mace/core/net.h"
+#include "mace/public/mace.h"
 #include "mace/utils/memory_logging.h"
 #include "mace/utils/timer.h"
 #include "mace/utils/utils.h"
@@ -27,30 +28,35 @@ namespace mace {
 NetBase::NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry,
                 const std::shared_ptr<const NetDef> net_def,
                 Workspace *ws,
-                 DeviceType type)
+                 Device *device)
    : name_(net_def->name()), op_registry_(op_registry) {
  MACE_UNUSED(ws);
-  MACE_UNUSED(type);
+  MACE_UNUSED(device);
 }

 SerialNet::SerialNet(
    const std::shared_ptr<const OperatorRegistryBase> op_registry,
    const std::shared_ptr<const NetDef> net_def,
    Workspace *ws,
-    DeviceType type,
+    Device *device,
    const NetMode mode)
-    : NetBase(op_registry, net_def, ws, type), device_type_(type) {
+    : NetBase(op_registry, net_def, ws, device), device_(device),
+      op_kernel_context_(new OpKernelContext(ws, device)) {
  MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
+  DeviceType device_type = device->device_type();
  for (int idx = 0; idx < net_def->op_size(); ++idx) {
    const auto &operator_def = net_def->op(idx);
    // TODO(liuqi): refactor to add device_type to OperatorDef
    const int op_device =
        ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
-            operator_def, "device", static_cast<int>(device_type_));
-    if (op_device == type) {
+            operator_def, "device", static_cast<int>(device_type));
+    if (op_device == device_type) {
+      VLOG(3) << "Creating operator " << operator_def.name() << "("
+              << operator_def.type() << ")";
      OperatorDef temp_def(operator_def);
      std::unique_ptr<OperatorBase> op(
-          op_registry->CreateOperator(temp_def, ws, type, mode));
+          op_registry->CreateOperator(temp_def, op_kernel_context_.get(),
+                                      device_type, mode));
      if (op) {
        operators_.emplace_back(std::move(op));
      }
@@ -61,13 +67,14 @@ SerialNet::SerialNet(
 MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
  MACE_MEMORY_LOGGING_GUARD();
  MACE_LATENCY_LOGGER(1, "Running net");
+  const DeviceType device_type = device_->device_type();
  for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
    auto &op = *iter;
    MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
                        op->debug_def().type(), "), mem_id: ",
                        MakeListString(op->debug_def().mem_id().data(),
                                       op->debug_def().mem_id().size()));
-    bool future_wait = (device_type_ == DeviceType::GPU &&
+    bool future_wait = (device_type == DeviceType::GPU &&
                        (run_metadata != nullptr ||
                         std::distance(iter, operators_.end()) == 1));

@@ -80,6 +87,9 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
      } else {
        future.wait_fn(nullptr);
      }
+#ifdef MACE_ENABLE_OPENCL
+      device_->opencl_runtime()->command_queue().finish();
+#endif
    } else if (run_metadata != nullptr) {
      call_stats.start_micros = NowMicros();
      MACE_RETURN_IF_ERROR(op->Run(nullptr));
@@ -125,7 +135,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
    VLOG(3) << "Operator " << op->debug_def().name()
            << " has shape: " << MakeString(op->Output(0)->shape());

-    if (EnvEnabled("MACE_LOG_TENSOR_RANGE") && device_type_ == CPU) {
+    if (EnvEnabled("MACE_LOG_TENSOR_RANGE") && device_type == CPU) {
      for (int i = 0; i < op->OutputSize(); ++i) {
        int data_type = op->GetOptionalArg("T", static_cast<int>(DT_FLOAT));
        if (data_type == static_cast<int>(DT_FLOAT)) {
@@ -151,20 +161,20 @@ std::unique_ptr<NetBase> CreateNet(
    const std::shared_ptr<const OperatorRegistryBase> op_registry,
    const NetDef &net_def,
    Workspace *ws,
-    DeviceType type,
+    Device *device,
    const NetMode mode) {
  std::shared_ptr<NetDef> tmp_net_def(new NetDef(net_def));
-  return CreateNet(op_registry, tmp_net_def, ws, type, mode);
+  return CreateNet(op_registry, tmp_net_def, ws, device, mode);
 }

 std::unique_ptr<NetBase> CreateNet(
    const std::shared_ptr<const OperatorRegistryBase> op_registry,
    const std::shared_ptr<const NetDef> net_def,
    Workspace *ws,
-    DeviceType type,
+    Device *device,
    const NetMode mode) {
  std::unique_ptr<NetBase> net(
-      new SerialNet(op_registry, net_def, ws, type, mode));
+      new SerialNet(op_registry, net_def, ws, device, mode));
  return net;
 }


--- a/mace/core/net.h
+++ b/mace/core/net.h
@@ -20,7 +20,6 @@
 #include <vector>

 #include "mace/core/operator.h"
-#include "mace/public/mace.h"

 namespace mace {

@@ -33,7 +32,7 @@ class NetBase {
  NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry,
          const std::shared_ptr<const NetDef> net_def,
          Workspace *ws,
-          DeviceType type);
+          Device *device);
  virtual ~NetBase() noexcept {}

  virtual MaceStatus Run(RunMetadata *run_metadata = nullptr) = 0;
@@ -52,14 +51,15 @@ class SerialNet : public NetBase {
  SerialNet(const std::shared_ptr<const OperatorRegistryBase> op_registry,
            const std::shared_ptr<const NetDef> net_def,
            Workspace *ws,
-            DeviceType type,
+            Device *device,
            const NetMode mode = NetMode::NORMAL);

  MaceStatus Run(RunMetadata *run_metadata = nullptr) override;

 protected:
  std::vector<std::unique_ptr<OperatorBase> > operators_;
-  DeviceType device_type_;
+  Device *device_;
+  std::unique_ptr<OpKernelContext> op_kernel_context_;

  MACE_DISABLE_COPY_AND_ASSIGN(SerialNet);
 };
@@ -68,13 +68,13 @@ std::unique_ptr<NetBase> CreateNet(
    const std::shared_ptr<const OperatorRegistryBase> op_registry,
    const NetDef &net_def,
    Workspace *ws,
-    DeviceType type,
+    Device *device,
    const NetMode mode = NetMode::NORMAL);
 std::unique_ptr<NetBase> CreateNet(
    const std::shared_ptr<const OperatorRegistryBase> op_registry,
    const std::shared_ptr<const NetDef> net_def,
    Workspace *ws,
-    DeviceType type,
+    Device *device,
    const NetMode mode = NetMode::NORMAL);

 }  // namespace mace

--- a/mace/core/op_kernel_context.cc
+++ b/mace/core/op_kernel_context.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/op_kernel_context.h"
+
+namespace mace {
+
+OpKernelContext::OpKernelContext(Workspace *ws, Device *device)
+    : device_(device), ws_(ws) {}
+
+OpKernelContext::~OpKernelContext() = default;
+
+Device* OpKernelContext::device() {
+  return device_;
+}
+
+Workspace* OpKernelContext::workspace() {
+  return ws_;
+}
+
+}  // namespace mace
--- a/mace/core/op_kernel_context.h
+++ b/mace/core/op_kernel_context.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_OP_KERNEL_CONTEXT_H_
+#define MACE_CORE_OP_KERNEL_CONTEXT_H_
+
+#include "mace/core/device.h"
+#include "mace/core/workspace.h"
+namespace mace {
+
+class OpKernelContext {
+ public:
+  OpKernelContext(Workspace *ws, Device *device);
+  ~OpKernelContext();
+  Device *device();
+  Workspace *workspace();
+ private:
+  Device *device_;
+  Workspace *ws_;
+};
+
+}  // namespace mace
+#endif  // MACE_CORE_OP_KERNEL_CONTEXT_H_
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -18,12 +18,15 @@
 #include <vector>

 #include "mace/core/operator.h"
+#include "mace/core/op_kernel_context.h"

 namespace mace {

-OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws)
-    : operator_ws_(ws),
-      operator_def_(std::make_shared<OperatorDef>(operator_def)) {}
+OperatorBase::OperatorBase(const OperatorDef &operator_def,
+                           OpKernelContext *context)
+    : operator_def_(std::make_shared<OperatorDef>(operator_def)) {
+  MACE_UNUSED(context);
+}

 OpKeyBuilder::OpKeyBuilder(const char *op_name) : op_name_(op_name) {}

@@ -54,7 +57,7 @@ OperatorRegistryBase::~OperatorRegistryBase() {}

 std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator(
    const OperatorDef &operator_def,
-    Workspace *ws,
+    OpKernelContext *context,
    DeviceType type,
    const NetMode mode) const {
  const int dtype = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
@@ -70,7 +73,7 @@ std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator(
            .Device(type)
            .TypeConstraint("T", static_cast<DataType>(dtype))
            .Build(),
-        operator_def, ws);
+        operator_def, context);
  } else {
    return nullptr;
  }

--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -22,17 +22,17 @@

 #include "mace/core/arg_helper.h"
 #include "mace/core/future.h"
+#include "mace/core/op_kernel_context.h"
 #include "mace/core/registry.h"
 #include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
 #include "mace/proto/mace.pb.h"
-#include "mace/public/mace.h"

 namespace mace {

 class OperatorBase {
 public:
-  explicit OperatorBase(const OperatorDef &operator_def, Workspace *ws);
+  explicit OperatorBase(const OperatorDef &operator_def, OpKernelContext *);
  virtual ~OperatorBase() noexcept {}

  template <typename T>
@@ -78,7 +78,6 @@ class OperatorBase {
  inline bool has_debug_def() const { return operator_def_ != nullptr; }

 protected:
-  Workspace *operator_ws_;
  std::shared_ptr<const OperatorDef> operator_def_;
  std::vector<const Tensor *> inputs_;
  std::vector<Tensor *> outputs_;
@@ -89,8 +88,9 @@ class OperatorBase {
 template <DeviceType D, class T>
 class Operator : public OperatorBase {
 public:
-  explicit Operator(const OperatorDef &operator_def, Workspace *ws)
-      : OperatorBase(operator_def, ws) {
+  explicit Operator(const OperatorDef &operator_def, OpKernelContext *context)
+      : OperatorBase(operator_def, context) {
+    Workspace *ws = context->workspace();
    for (const std::string &input_str : operator_def.input()) {
      const Tensor *tensor = ws->GetTensor(input_str);
      MACE_CHECK(tensor != nullptr, "op ", operator_def.type(),
@@ -116,7 +116,7 @@ class Operator : public OperatorBase {
          output_type = DataTypeToEnum<T>::v();
        }
        outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
-          output_str, GetDeviceAllocator(D), output_type)));
+          output_str, context->device()->allocator(), output_type)));
      }
    }
  }
@@ -165,13 +165,16 @@ OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {

 class OperatorRegistryBase {
 public:
-  typedef Registry<std::string, OperatorBase, const OperatorDef &, Workspace *>
+  typedef Registry<std::string,
+                   OperatorBase,
+                   const OperatorDef &,
+                   OpKernelContext *>
      RegistryType;
  OperatorRegistryBase() = default;
  virtual ~OperatorRegistryBase();
  RegistryType *registry() { return &registry_; }
  std::unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def,
-                                               Workspace *ws,
+                                               OpKernelContext *context,
                                               DeviceType type,
                                               const NetMode mode) const;

@@ -183,7 +186,7 @@ class OperatorRegistryBase {
 MACE_DECLARE_REGISTRY(OpRegistry,
                      OperatorBase,
                      const OperatorDef &,
-                      Workspace *);
+                      OpKernelContext *);

 #define MACE_REGISTER_OPERATOR(op_registry, name, ...) \
  MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__)

--- a/mace/core/registry.h
+++ b/mace/core/registry.h
@@ -22,7 +22,6 @@
 #include <string>
 #include <vector>

-#include "mace/public/mace.h"
 #include "mace/utils/logging.h"

 namespace mace {

--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -30,7 +30,6 @@
 #include "public/gemmlowp.h"
 #include "mace/core/macros.h"
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/logging.h"

 namespace mace {

--- a/mace/core/runtime/cpu/cpu_runtime.h
+++ b/mace/core/runtime/cpu/cpu_runtime.h
@@ -18,7 +18,6 @@
 #include <vector>

 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"

 namespace mace {

@@ -34,6 +33,16 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
                                             CPUAffinityPolicy policy,
                                             bool use_gemmlowp = false);

+class CPURuntime {
+ public:
+  explicit CPURuntime(const int num_threads) : num_threads_(num_threads) {}
+  ~CPURuntime() = default;
+  inline int num_threads() const {
+    return num_threads_;
+  }
+ private:
+  int num_threads_;
+};
 }  // namespace mace

 #endif  // MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
--- a/mace/core/runtime/opencl/gpu_device.cc
+++ b/mace/core/runtime/opencl/gpu_device.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/runtime/opencl/gpu_device.h"
+
+namespace mace {
+
+GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
+                     KVStorage *opencl_cache_storage,
+                     const GPUPriorityHint priority,
+                     const GPUPerfHint perf,
+                     KVStorage *opencl_binary_storage,
+                     const int num_threads) :
+    CPUDevice(num_threads),
+    runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf,
+                               opencl_binary_storage, tuner)),
+    allocator_(new OpenCLAllocator(runtime_.get())) {}
+
+GPUDevice::~GPUDevice() = default;
+
+OpenCLRuntime* GPUDevice::opencl_runtime() {
+  return runtime_.get();
+}
+
+Allocator* GPUDevice::allocator() {
+  return allocator_.get();
+}
+
+DeviceType GPUDevice::device_type() const {
+  return DeviceType::GPU;
+}
+
+}  // namespace mace
--- a/mace/core/runtime/opencl/gpu_device.h
+++ b/mace/core/runtime/opencl/gpu_device.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_
+#define MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_
+
+#include <memory>
+
+#include "mace/core/device_context.h"
+#include "mace/core/device.h"
+#include "mace/core/runtime/opencl/opencl_allocator.h"
+
+namespace mace {
+
+class GPUDevice : public CPUDevice {
+ public:
+  GPUDevice(Tuner<uint32_t> *tuner,
+            KVStorage *opencl_cache_storage = nullptr,
+            const GPUPriorityHint priority = GPUPriorityHint::PRIORITY_LOW,
+            const GPUPerfHint perf = GPUPerfHint::PERF_NORMAL,
+            KVStorage *opencl_binary_storage = nullptr,
+            const int num_threads = -1);
+  ~GPUDevice();
+  OpenCLRuntime *opencl_runtime() override;
+  Allocator *allocator() override;
+  DeviceType device_type() const override;
+ private:
+  std::unique_ptr<OpenCLRuntime> runtime_;
+  std::unique_ptr<OpenCLAllocator> allocator_;
+};
+
+}  // namespace mace
+#endif  // MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_
--- a/mace/core/runtime/opencl/opencl_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_allocator.cc
@@ -12,8 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <memory>
+
 #include "mace/core/runtime/opencl/opencl_allocator.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"

 namespace mace {
@@ -37,7 +38,9 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
 }
 }  // namespace

-OpenCLAllocator::OpenCLAllocator() {}
+OpenCLAllocator::OpenCLAllocator(
+    OpenCLRuntime *opencl_runtime):
+    opencl_runtime_(opencl_runtime) {}

 OpenCLAllocator::~OpenCLAllocator() {}
 MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
@@ -51,7 +54,7 @@ MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
  }

  cl_int error;
-  cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(),
+  cl::Buffer *buffer = new cl::Buffer(opencl_runtime_->context(),
                                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                      nbytes, nullptr, &error);
  if (error != CL_SUCCESS) {
@@ -82,7 +85,7 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,

  cl_int error;
  cl::Image2D *cl_image =
-      new cl::Image2D(OpenCLRuntime::Global()->context(),
+      new cl::Image2D(opencl_runtime_->context(),
                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format,
                      image_shape[0], image_shape[1], 0, nullptr, &error);
  if (error != CL_SUCCESS) {
@@ -116,8 +119,9 @@ void OpenCLAllocator::DeleteImage(void *buffer) const {
 }

 void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
+  VLOG(3) << "Map OpenCL buffer";
  auto cl_buffer = static_cast<cl::Buffer *>(buffer);
-  auto queue = OpenCLRuntime::Global()->command_queue();
+  auto queue = opencl_runtime_->command_queue();
  // TODO(heliangliang) Non-blocking call
  cl_int error;
  void *mapped_ptr =
@@ -134,14 +138,15 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
 void *OpenCLAllocator::MapImage(void *buffer,
                                const std::vector<size_t> &image_shape,
                                std::vector<size_t> *mapped_image_pitch) const {
-  MACE_CHECK(image_shape.size() == 2, "Just support map 2d image");
+  VLOG(3) << "Map OpenCL Image";
+  MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
  auto cl_image = static_cast<cl::Image2D *>(buffer);
  std::array<size_t, 3> origin = {0, 0, 0};
  std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1};

  mapped_image_pitch->resize(2);
  cl_int error;
-  void *mapped_ptr = OpenCLRuntime::Global()->command_queue().enqueueMapImage(
+  void *mapped_ptr = opencl_runtime_->command_queue().enqueueMapImage(
      *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
      mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
      nullptr, &error);
@@ -153,8 +158,9 @@ void *OpenCLAllocator::MapImage(void *buffer,
 }

 void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
+  VLOG(3) << "Unmap OpenCL buffer/Image";
  auto cl_buffer = static_cast<cl::Buffer *>(buffer);
-  auto queue = OpenCLRuntime::Global()->command_queue();
+  auto queue = opencl_runtime_->command_queue();
  cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,
                                             nullptr, nullptr);
  if (error != CL_SUCCESS) {

--- a/mace/core/runtime/opencl/opencl_allocator.h
+++ b/mace/core/runtime/opencl/opencl_allocator.h
@@ -15,15 +15,17 @@
 #ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_
 #define MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_

+#include <memory>
 #include <vector>

 #include "mace/core/allocator.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"

 namespace mace {

 class OpenCLAllocator : public Allocator {
 public:
-  OpenCLAllocator();
+  explicit OpenCLAllocator(OpenCLRuntime *opencl_runtime);

  ~OpenCLAllocator() override;

@@ -51,6 +53,9 @@ class OpenCLAllocator : public Allocator {
  void Unmap(void *buffer, void *mapped_ptr) const override;

  bool OnHost() const override;
+
+ private:
+  OpenCLRuntime *opencl_runtime_;
 };

 }  // namespace mace

--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -24,11 +24,9 @@
 #include <vector>
 #include <utility>

-#include "mace/public/mace_runtime.h"
 #include "mace/core/macros.h"
 #include "mace/core/file_storage.h"
 #include "mace/core/runtime/opencl/opencl_extension.h"
-#include "mace/public/mace.h"
 #include "mace/utils/tuner.h"

 namespace mace {
@@ -249,14 +247,12 @@ std::string FindFirstExistPath(const std::vector<std::string> &paths) {

 const char *kOpenCLPlatformInfoKey =
    "mace_opencl_precompiled_platform_info_key";
-const char *kPrecompiledProgramFileName =
-    "mace_cl_compiled_program.bin";
 }  // namespace

 void OpenCLProfilingTimer::StartTiming() {}

 void OpenCLProfilingTimer::StopTiming() {
-  OpenCLRuntime::Global()->command_queue().finish();
+  runtime_->command_queue().finish();
  start_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
  stop_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
 }
@@ -278,35 +274,15 @@ void OpenCLProfilingTimer::ClearTiming() {
  accumulated_micros_ = 0;
 }

-GPUPerfHint OpenCLRuntime::kGPUPerfHint = GPUPerfHint::PERF_NORMAL;
-GPUPriorityHint OpenCLRuntime::kGPUPriorityHint =
-    GPUPriorityHint::PRIORITY_DEFAULT;
-std::string
-    OpenCLRuntime::kPrecompiledBinaryPath = "";  // NOLINT(runtime/string)
-
-OpenCLRuntime *OpenCLRuntime::Global() {
-  static OpenCLRuntime runtime;
-  return &runtime;
-}
-
-void OpenCLRuntime::Configure(GPUPerfHint gpu_perf_hint,
-                              GPUPriorityHint gpu_priority_hint) {
-  OpenCLRuntime::kGPUPerfHint = gpu_perf_hint;
-  OpenCLRuntime::kGPUPriorityHint = gpu_priority_hint;
-}
-
-void OpenCLRuntime::ConfigureOpenCLBinaryPath(
-    const std::vector<std::string> &paths) {
-  OpenCLRuntime::kPrecompiledBinaryPath = FindFirstExistPath(paths);
-  if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
-    LOG(WARNING) << "There is no precompiled OpenCL binary file in "
-                 << MakeString(paths);
-  }
-}
-
-OpenCLRuntime::OpenCLRuntime():
-    precompiled_binary_storage_(nullptr),
-    cache_storage_(nullptr),
+OpenCLRuntime::OpenCLRuntime(
+    KVStorage *cache_storage,
+    const GPUPriorityHint priority_hint,
+    const GPUPerfHint perf_hint,
+    KVStorage *precompiled_binary_storage,
+    Tuner<uint32_t> *tuner):
+    cache_storage_(cache_storage),
+    precompiled_binary_storage_(precompiled_binary_storage),
+    tuner_(tuner),
    is_opencl_avaliable_(false),
    is_profiling_enabled_(false),
    opencl_version_(CL_VER_UNKNOWN),
@@ -362,7 +338,7 @@ OpenCLRuntime::OpenCLRuntime():
  cl_command_queue_properties properties = 0;

  const char *profiling = getenv("MACE_OPENCL_PROFILING");
-  if (Tuner<uint32_t>::Get()->IsTuning() ||
+  if (IsTuning() ||
      (profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) {
    properties |= CL_QUEUE_PROFILING_ENABLE;
    is_profiling_enabled_ = true;
@@ -374,8 +350,8 @@ OpenCLRuntime::OpenCLRuntime():
    std::vector<cl_context_properties> context_properties;
    context_properties.reserve(5);
    GetAdrenoContextProperties(&context_properties,
-                               OpenCLRuntime::kGPUPerfHint,
-                               OpenCLRuntime::kGPUPriorityHint);
+                               perf_hint,
+                               priority_hint);
    context_ = std::shared_ptr<cl::Context>(
        new cl::Context({*device_}, context_properties.data(),
                        nullptr, nullptr, &err));
@@ -408,12 +384,8 @@ OpenCLRuntime::OpenCLRuntime():
    return;
  }

-  extern std::shared_ptr<KVStorageFactory> kStorageFactory;
  std::string cached_binary_platform_info;
-  if (kStorageFactory != nullptr) {
-    cache_storage_ =
-        kStorageFactory->CreateStorage(kPrecompiledProgramFileName);
-
+  if (cache_storage_ != nullptr) {
    if (cache_storage_->Load() != 0) {
      LOG(WARNING) << "Load OpenCL cached compiled kernel file failed. "
                   << "Please make sure the storage directory exist "
@@ -432,9 +404,10 @@ OpenCLRuntime::OpenCLRuntime():
  }

  if (cached_binary_platform_info != platform_info_) {
-    if (!OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
-      precompiled_binary_storage_.reset(
-          new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath));
+    if (precompiled_binary_storage_ == nullptr) {
+      VLOG(1) << "There is no precompiled OpenCL binary in"
+          " all OpenCL binary paths.";
+    } else {
      if (precompiled_binary_storage_->Load() != 0) {
        LOG(WARNING) << "Load OpenCL precompiled kernel file failed. "
                     << "Please make sure the storage directory exist "
@@ -487,6 +460,8 @@ cl::Device &OpenCLRuntime::device() { return *device_; }

 cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; }

+Tuner<uint32_t> *OpenCLRuntime::tuner() { return tuner_; }
+
 uint64_t OpenCLRuntime::device_global_mem_cache_size() const {
  return device_gloabl_mem_cache_size_;
 }

--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -22,11 +22,12 @@
 #include <string>
 #include <vector>

+#include "mace/core/file_storage.h"
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/string_util.h"
 #include "mace/utils/timer.h"
+#include "mace/utils/tuner.h"

 namespace mace {

@@ -60,29 +61,17 @@ const std::string OpenCLErrorToString(cl_int error);
    return MaceStatus::MACE_OUT_OF_RESOURCES;               \
  }

-class OpenCLProfilingTimer : public Timer {
- public:
-  explicit OpenCLProfilingTimer(const cl::Event *event)
-      : event_(event), accumulated_micros_(0) {}
-  void StartTiming() override;
-  void StopTiming() override;
-  void AccumulateTiming() override;
-  void ClearTiming() override;
-  double ElapsedMicros() override;
-  double AccumulatedMicros() override;
-
- private:
-  const cl::Event *event_;
-  double start_nanos_;
-  double stop_nanos_;
-  double accumulated_micros_;
-};
-
 class OpenCLRuntime {
 public:
-  static OpenCLRuntime *Global();
-  static void Configure(GPUPerfHint, GPUPriorityHint);
-  static void ConfigureOpenCLBinaryPath(const std::vector<std::string> &paths);
+  OpenCLRuntime(
+      KVStorage *cache_storage = nullptr,
+      const GPUPriorityHint priority_hint = GPUPriorityHint::PRIORITY_NORMAL,
+      const GPUPerfHint perf_hint = GPUPerfHint::PERF_NORMAL,
+      KVStorage *precompiled_binary_storage = nullptr,
+      Tuner<uint32_t> *tuner = nullptr);
+  ~OpenCLRuntime();
+  OpenCLRuntime(const OpenCLRuntime &) = delete;
+  OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;

  cl::Context &context();
  cl::Device &device();
@@ -91,6 +80,7 @@ class OpenCLRuntime {
  const std::string platform_info() const;
  uint64_t device_global_mem_cache_size() const;
  uint32_t device_compute_units() const;
+  Tuner<uint32_t> *tuner();
  bool is_opencl_avaliable();

  void GetCallStats(const cl::Event &event, CallStats *stats);
@@ -112,11 +102,6 @@ class OpenCLRuntime {
  void SaveBuiltCLProgram();

 private:
-  OpenCLRuntime();
-  ~OpenCLRuntime();
-  OpenCLRuntime(const OpenCLRuntime &) = delete;
-  OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
-
  bool BuildProgram(const std::string &program_file_name,
                    const std::string &binary_file_name,
                    const std::string &build_options,
@@ -137,10 +122,13 @@ class OpenCLRuntime {
  OpenCLVersion ParseDeviceVersion(const std::string &device_version);

 private:
-  std::unique_ptr<KVStorage> precompiled_binary_storage_;
-  std::unique_ptr<KVStorage> cache_storage_;
+  KVStorage *cache_storage_;
+  KVStorage *precompiled_binary_storage_;
+  Tuner<uint32_t> *tuner_;
  bool is_opencl_avaliable_;
  bool is_profiling_enabled_;
+  OpenCLVersion opencl_version_;
+  GPUType gpu_type_;
  // All OpenCL object must be a pointer and manually deleted before unloading
  // OpenCL library.
  std::shared_ptr<cl::Context> context_;
@@ -149,18 +137,30 @@ class OpenCLRuntime {
  std::map<std::string, cl::Program> built_program_map_;
  std::mutex program_build_mutex_;
  std::string platform_info_;
-  OpenCLVersion opencl_version_;
  std::string precompiled_binary_platform_info_;
  bool out_of_range_check_;
  uint64_t device_gloabl_mem_cache_size_;
  uint32_t device_compute_units_;
-  GPUType gpu_type_;
-
-  static GPUPerfHint kGPUPerfHint;
-  static GPUPriorityHint kGPUPriorityHint;
-  static std::string kPrecompiledBinaryPath;
 };

+class OpenCLProfilingTimer : public Timer {
+ public:
+  OpenCLProfilingTimer(OpenCLRuntime *runtime, const cl::Event *event)
+      : runtime_(runtime), event_(event), accumulated_micros_(0) {}
+  void StartTiming() override;
+  void StopTiming() override;
+  void AccumulateTiming() override;
+  void ClearTiming() override;
+  double ElapsedMicros() override;
+  double AccumulatedMicros() override;
+
+ private:
+  OpenCLRuntime *runtime_;
+  const cl::Event *event_;
+  double start_nanos_;
+  double stop_nanos_;
+  double accumulated_micros_;
+};
 }  // namespace mace

 #endif  // MACE_CORE_RUNTIME_OPENCL_OPENCL_RUNTIME_H_
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -25,7 +25,6 @@
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/cl2_header.h"
 #endif
-#include "mace/public/mace.h"
 #include "mace/utils/logging.h"

 #ifdef MACE_ENABLE_NEON
@@ -38,10 +37,10 @@
 namespace mace {

 #define MACE_SINGLE_ARG(...) __VA_ARGS__
-#define MACE_CASE(TYPE, STATEMENTS)             \
+#define MACE_CASE(TYPE, STATEMENTS)   \
  case DataTypeToEnum<TYPE>::value: { \
    typedef TYPE T;                   \
-    STATEMENTS;                            \
+    STATEMENTS;                       \
    break;                            \
  }

@@ -137,7 +136,7 @@ class Tensor {
    buffer_ = &buffer_slice_;
  }

-  Tensor() : Tensor(GetDeviceAllocator(CPU), DT_FLOAT) {}
+  Tensor() : Tensor(GetCPUAllocator(), DT_FLOAT) {}

  ~Tensor() {
    if (is_buffer_owner_ && buffer_ != nullptr) {
@@ -270,7 +269,7 @@ class Tensor {
    image_shape_ = image_shape;
    if (buffer_ == nullptr) {
      MACE_CHECK(is_buffer_owner_);
-      buffer_ = new Image();
+      buffer_ = new Image(allocator_);
      return buffer_->Allocate(image_shape, dtype_);
    } else {
      MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");

--- a/mace/core/testing/test_benchmark_main.cc
+++ b/mace/core/testing/test_benchmark_main.cc
@@ -16,15 +16,10 @@

 #include "gflags/gflags.h"
 #include "mace/core/runtime/cpu/cpu_runtime.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
-#include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/logging.h"

 DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*");
-DEFINE_int32(gpu_perf_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
-DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
 DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
 DEFINE_int32(cpu_affinity_policy, 1,
             "0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");
@@ -43,10 +38,6 @@ int main(int argc, char **argv) {
    LOG(WARNING) << "Set openmp or cpu affinity failed.";
  }

-  mace::OpenCLRuntime::Configure(
-      static_cast<mace::GPUPerfHint>(FLAGS_gpu_perf_hint),
-      static_cast<mace::GPUPriorityHint>(FLAGS_gpu_priority_hint));
-
  mace::testing::Benchmark::Run(FLAGS_filter.c_str());
  return 0;
 }
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -12,6 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "mace/core/workspace.h"
+
+#include <memory>
 #include <string>
 #include <vector>
 #include <unordered_set>
@@ -21,8 +24,6 @@
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #endif
-#include "mace/core/workspace.h"
-#include "mace/utils/timer.h"

 namespace mace {

@@ -35,8 +36,8 @@ bool ShouldPreallocateMemoryForOp(const OperatorDef &op) {
 }
 }  // namespace

-Workspace::Workspace() : host_scratch_buffer_(new ScratchBuffer(
-  GetDeviceAllocator(DeviceType::CPU))) {}
+Workspace::Workspace() :
+    host_scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {}

 Tensor *Workspace::CreateTensor(const std::string &name,
                                Allocator *alloc,
@@ -74,7 +75,7 @@ std::vector<std::string> Workspace::Tensors() const {
 }

 MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
-                                      DeviceType type,
+                                      Device *device,
                                      const unsigned char *model_data) {
  MACE_LATENCY_LOGGER(1, "Load model tensors");
  index_t model_data_size = 0;
@@ -87,10 +88,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
  }
  VLOG(3) << "Model data size: " << model_data_size;

+  const DeviceType device_type = device->device_type();
+
  if (model_data_size > 0) {
 #ifdef MACE_ENABLE_OPENCL
-    if (type == DeviceType::GPU &&
-        OpenCLRuntime::Global()->GetDeviceMaxMemAllocSize() <=
+    if (device_type == DeviceType::GPU &&
+        device->opencl_runtime()->GetDeviceMaxMemAllocSize() <=
            static_cast<uint64_t>(model_data_size)) {
      for (auto &const_tensor : net_def.tensors()) {
        MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
@@ -104,7 +107,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
        }

        std::unique_ptr<Tensor> tensor(
-            new Tensor(GetDeviceAllocator(type),
+            new Tensor(device->allocator(),
                       const_tensor.data_type(), true));
        tensor->Resize(dims);

@@ -129,14 +132,14 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
 #else
    {
 #endif
-      if (type == DeviceType::CPU) {
+      if (device_type == DeviceType::CPU) {
        tensor_buffer_ = std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(type),
+            new Buffer(device->allocator(),
                       const_cast<unsigned char*>(model_data),
                       model_data_size));
      } else {
        tensor_buffer_ = std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(type)));
+            new Buffer(device->allocator()));
        MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size));
        tensor_buffer_->Map(nullptr);
        tensor_buffer_->Copy(const_cast<unsigned char*>(model_data),
@@ -170,12 +173,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
    }
  }

-  if (type == DeviceType::CPU || type == DeviceType::GPU) {
-    MaceStatus status = CreateOutputTensorBuffer(net_def, type);
+  if (device_type == DeviceType::CPU || device_type == DeviceType::GPU) {
+    MaceStatus status = CreateOutputTensorBuffer(net_def, device);
    if (status != MaceStatus::MACE_SUCCESS) return status;
  }

-  if (type == DeviceType::CPU && net_def.has_quantize_info()) {
+  if (device_type == DeviceType::CPU && net_def.has_quantize_info()) {
    for (const auto
          &activation_info: net_def.quantize_info().activation_info()) {
      if (HasTensor(activation_info.tensor_name())) {
@@ -193,7 +196,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
 }

 MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
-                                               DeviceType device_type) {
+                                               Device *device) {
+  DeviceType device_type = device->device_type();
  DataType dtype = DataType::DT_INVALID;
  if (net_def.mem_arena().mem_block_size() > 0) {
    // We use the data type of the first op with mem id,
@@ -227,7 +231,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
              << ", memory type: " << mem_block.mem_type();
      if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
        std::unique_ptr<BufferBase> tensor_buf(
-            new Buffer(GetDeviceAllocator(DeviceType::CPU)));
+            new Buffer(GetCPUAllocator()));
        MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
            mem_block.x() * GetEnumTypeSize(dtype)
                + MACE_EXTRA_BUFFER_PAD_SIZE));
@@ -235,14 +239,14 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
                                          std::move(tensor_buf));
      } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
        std::unique_ptr<BufferBase> image_buf(
-            new Image());
+            new Image(device->allocator()));
        MACE_RETURN_IF_ERROR(image_buf->Allocate(
            {mem_block.x(), mem_block.y()}, dtype));
        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                          std::move(image_buf));
      } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
        std::unique_ptr<BufferBase> tensor_buf(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU)));
+            new Buffer(device->allocator()));
        MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
            mem_block.x() * GetEnumTypeSize(dtype)));
        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
@@ -305,7 +309,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
                op, "T", static_cast<int>(DT_FLOAT)));
          }
          CreateTensor(op.output(i),
-                       GetDeviceAllocator(device_type),
+                       device->allocator(),
                       output_type);
        }
      }
@@ -335,7 +339,8 @@ void Workspace::RemoveUnusedBuffer() {
 }

 void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
-                                      const unsigned char *model_data) {
+                                      const unsigned char *model_data,
+                                      Allocator *alloc) {
  for (auto &const_tensor : net_def.tensors()) {
    auto iter = tensor_map_.find(const_tensor.name());
    if (iter->second->unused()) {
@@ -347,8 +352,7 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
        dims.push_back(d);
      }
      std::unique_ptr<Tensor> tensor(
-          new Tensor(GetDeviceAllocator(DeviceType::GPU),
-                     const_tensor.data_type()));
+          new Tensor(alloc, const_tensor.data_type()));
      tensor->Resize(dims);
      MACE_CHECK(tensor->size() == const_tensor.data_size(),
                 "Tensor's data_size not equal with the shape");

--- a/mace/core/workspace.h
+++ b/mace/core/workspace.h
@@ -20,6 +20,7 @@
 #include <vector>
 #include <memory>

+#include "mace/core/device.h"
 #include "mace/core/preallocated_pooled_allocator.h"
 #include "mace/core/tensor.h"
 #include "mace/public/mace.h"
@@ -48,7 +49,7 @@ class Workspace {
  std::vector<std::string> Tensors() const;

  MaceStatus LoadModelTensor(const NetDef &net_def,
-                             DeviceType type,
+                             Device *device,
                             const unsigned char *model_data);

  ScratchBuffer *GetScratchBuffer(DeviceType device_type);
@@ -56,11 +57,14 @@ class Workspace {
  void RemoveUnusedBuffer();

  void RemoveAndReloadBuffer(const NetDef &net_def,
-                             const unsigned char *model_data);
+                             const unsigned char *model_data,
+                             Allocator *alloc);

 private:
  MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
-                                      DeviceType device_type);
+                                      Device *device);
+
+  Device *device_;

  TensorMap tensor_map_;


--- a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/AppModel.java
+++ b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/AppModel.java
@@ -37,15 +37,13 @@ public class AppModel {
        mJniThread = new Handler(thread.getLooper());
    }

-    public void maceMobilenetSetAttrs(final InitData initData) {
+    public void maceMobilenetCreateGPUContext(final InitData initData) {
        mJniThread.post(new Runnable() {
            @Override
            public void run() {
-                int result = JniMaceUtils.maceMobilenetSetAttrs(
-                        initData.getOmpNumThreads(), initData.getCpuAffinityPolicy(),
-                        initData.getGpuPerfHint(), initData.getGpuPriorityHint(),
-                        initData.getKernelPath());
-                Log.i("APPModel", "maceMobilenetSetAttrs result = " + result);
+                int result = JniMaceUtils.maceMobilenetCreateGPUContext(
+                        initData.getStoragePath());
+                Log.i("APPModel", "maceMobilenetCreateGPUContext result = " + result);
            }
        });
    }
@@ -54,7 +52,10 @@ public class AppModel {
        mJniThread.post(new Runnable() {
            @Override
            public void run() {
-                int result = JniMaceUtils.maceMobilenetCreateEngine(initData.getModel(), initData.getDevice());
+                int result = JniMaceUtils.maceMobilenetCreateEngine(
+                        initData.getOmpNumThreads(), initData.getCpuAffinityPolicy(),
+                        initData.getGpuPerfHint(), initData.getGpuPriorityHint(),
+                        initData.getModel(), initData.getDevice());
                Log.i("APPModel", "maceMobilenetCreateEngine result = " + result);

                if (result == -1) {

--- a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/CameraActivity.java
+++ b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/CameraActivity.java
@@ -139,7 +139,7 @@ public class CameraActivity extends Activity implements View.OnClickListener, Ap
    }

    private void initJni() {
-        AppModel.instance.maceMobilenetSetAttrs(initData);
+        AppModel.instance.maceMobilenetCreateGPUContext(initData);
        AppModel.instance.maceMobilenetCreateEngine(initData, this);
    }


--- a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/result/InitData.java
+++ b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/result/InitData.java
@@ -29,7 +29,7 @@ public class InitData {
    private int cpuAffinityPolicy;
    private int gpuPerfHint;
    private int gpuPriorityHint;
-    private String kernelPath = "";
+    private String storagePath = "";

    public InitData() {
        model = MODELS[0];
@@ -38,8 +38,8 @@ public class InitData {
        gpuPerfHint = 3;
        gpuPriorityHint = 3;
        device = DEVICES[0];
-        kernelPath = Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "mace";
-        File file = new File(kernelPath);
+        storagePath = Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "mace";
+        File file = new File(storagePath);
        if (!file.exists()) {
            file.mkdir();
        }
@@ -94,11 +94,11 @@ public class InitData {
        this.gpuPriorityHint = gpuPriorityHint;
    }

-    public String getKernelPath() {
-        return kernelPath;
+    public String getStoragePath() {
+        return storagePath;
    }

-    public void setKernelPath(String kernelPath) {
-        this.kernelPath = kernelPath;
+    public void setStoragePath(String storagePath) {
+        this.storagePath = storagePath;
    }
 }
--- a/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc
+++ b/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc
@@ -26,7 +26,6 @@
 #include <numeric>

 #include "src/main/cpp/include/mace/public/mace.h"
-#include "src/main/cpp/include/mace/public/mace_runtime.h"
 #include "src/main/cpp/include/mace/public/mace_engine_factory.h"

 namespace {
@@ -39,8 +38,8 @@ struct ModelInfo {
 };

 struct MaceContext {
+  std::shared_ptr<mace::GPUContext> gpu_context;
  std::shared_ptr<mace::MaceEngine> engine;
-  std::shared_ptr<mace::KVStorageFactory> storage_factory;
  std::string model_name;
  mace::DeviceType device_type = mace::DeviceType::CPU;
  std::map<std::string, ModelInfo> model_infos = {
@@ -72,48 +71,65 @@ MaceContext& GetMaceContext() {

 }  // namespace

-JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs(
-    JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy,
-    jint gpu_perf_hint, jint gpu_priority_hint, jstring kernel_path) {
+JNIEXPORT jint JNICALL
+Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext(
+    JNIEnv *env, jclass thisObj, jstring storage_path) {
  MaceContext &mace_context = GetMaceContext();
+  // DO NOT USE tmp directory.
+  // Please use APP's own directory and make sure the directory exists.
+  const char *storage_path_ptr = env->GetStringUTFChars(storage_path, nullptr);
+  if (storage_path_ptr == nullptr) return JNI_ERR;
+  const std::string storage_file_path(storage_path_ptr);
+  env->ReleaseStringUTFChars(storage_path, storage_path_ptr);

-  mace::MaceStatus status;
-      // openmp
-  status = mace::SetOpenMPThreadPolicy(
-      omp_num_threads,
-      static_cast<mace::CPUAffinityPolicy>(cpu_affinity_policy));
-
-  __android_log_print(ANDROID_LOG_ERROR,
-                      "image_classify attrs",
-                      "openmp result: %d, threads: %d, cpu: %d",
-                      status, omp_num_threads, cpu_affinity_policy);
-
-  //  gpu
-  mace::SetGPUHints(
-      static_cast<mace::GPUPerfHint>(gpu_perf_hint),
-      static_cast<mace::GPUPriorityHint>(gpu_priority_hint));
-
-  __android_log_print(ANDROID_LOG_ERROR,
-                      "image_classify attrs",
-                      "gpu perf: %d, priority: %d",
-                      gpu_perf_hint, gpu_priority_hint);
-
-  //  opencl cache
-  const char *kernel_path_ptr = env->GetStringUTFChars(kernel_path, nullptr);
-  if (kernel_path_ptr == nullptr) return JNI_ERR;
-  const std::string kernel_file_path(kernel_path_ptr);
-  mace_context.storage_factory.reset(
-      new mace::FileStorageFactory(kernel_file_path));
-  mace::SetKVStorageFactory(mace_context.storage_factory);
-  env->ReleaseStringUTFChars(kernel_path, kernel_path_ptr);
+  mace_context.gpu_context = mace::GPUContextBuilder()
+      .SetStoragePath(storage_file_path)
+      .Finalize();

  return JNI_OK;
 }

 JNIEXPORT jint JNICALL
 Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine(
-    JNIEnv *env, jclass thisObj, jstring model_name_str, jstring device) {
+    JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy,
+    jint gpu_perf_hint, jint gpu_priority_hint,
+    jstring model_name_str, jstring device) {
  MaceContext &mace_context = GetMaceContext();
+
+  // get device
+  const char *device_ptr = env->GetStringUTFChars(device, nullptr);
+  if (device_ptr == nullptr) return JNI_ERR;
+  mace_context.device_type = ParseDeviceType(device_ptr);
+  env->ReleaseStringUTFChars(device, device_ptr);
+
+  // create MaceEngineConfig
+  mace::MaceStatus status;
+  mace::MaceEngineConfig config(mace_context.device_type);
+  status = config.SetCPUThreadPolicy(
+      omp_num_threads,
+      static_cast<mace::CPUAffinityPolicy>(cpu_affinity_policy));
+  if (status != mace::MACE_SUCCESS) {
+    __android_log_print(ANDROID_LOG_ERROR,
+                        "image_classify attrs",
+                        "openmp result: %d, threads: %d, cpu: %d",
+                        status, omp_num_threads, cpu_affinity_policy);
+  }
+  if (mace_context.device_type == mace::DeviceType::GPU) {
+    config.SetGPUContext(mace_context.gpu_context);
+    config.SetGPUHints(
+        static_cast<mace::GPUPerfHint>(gpu_perf_hint),
+        static_cast<mace::GPUPriorityHint>(gpu_priority_hint));
+    __android_log_print(ANDROID_LOG_INFO,
+                        "image_classify attrs",
+                        "gpu perf: %d, priority: %d",
+                        gpu_perf_hint, gpu_priority_hint);
+  }
+
+  __android_log_print(ANDROID_LOG_INFO,
+                      "image_classify attrs",
+                      "device: %d",
+                      mace_context.device_type);
+
  //  parse model name
  const char *model_name_ptr = env->GetStringUTFChars(model_name_str, nullptr);
  if (model_name_ptr == nullptr) return JNI_ERR;
@@ -133,26 +149,15 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine(
  std::vector<std::string> input_names = {model_info_iter->second.input_name};
  std::vector<std::string> output_names = {model_info_iter->second.output_name};

-  // get device
-  const char *device_ptr = env->GetStringUTFChars(device, nullptr);
-  if (device_ptr == nullptr) return JNI_ERR;
-  mace_context.device_type = ParseDeviceType(device_ptr);
-  env->ReleaseStringUTFChars(device, device_ptr);
-
-  __android_log_print(ANDROID_LOG_ERROR,
-                      "image_classify attrs",
-                      "device: %d",
-                      mace_context.device_type);
-
  mace::MaceStatus create_engine_status =
      CreateMaceEngineFromCode(mace_context.model_name,
                               std::string(),
                               input_names,
                               output_names,
-                               mace_context.device_type,
+                               config,
                               &mace_context.engine);

-  __android_log_print(ANDROID_LOG_ERROR,
+  __android_log_print(ANDROID_LOG_INFO,
                      "image_classify attrs",
                      "create result: %d",
                      create_engine_status);

--- a/mace/examples/android/macelibrary/src/main/cpp/image_classify.h
+++ b/mace/examples/android/macelibrary/src/main/cpp/image_classify.h
@@ -24,11 +24,13 @@ extern "C" {
 #endif
 /*
 * Class:     com_xiaomi_mace_JniMaceUtils
- * Method:    maceMobilenetSetAttrs
+ * Method:    maceMobilenetCreateGPUContext
 * Signature: (Ljava/lang/String;IIIILjava/lang/String;)I
 */
-JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs
-  (JNIEnv *, jclass, jint, jint, jint, jint, jstring);
+JNIEXPORT jint JNICALL
+Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext(JNIEnv *,
+                                                                jclass,
+                                                                jstring);

 /*
 * Class:     com_xiaomi_mace_JniMaceUtils
@@ -37,7 +39,7 @@ JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs
 */
 JNIEXPORT jint JNICALL
 Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine
-  (JNIEnv *, jclass, jstring, jstring);
+  (JNIEnv *, jclass, jint, jint, jint, jint, jstring, jstring);

 /*
 * Class:     com_xiaomi_mace_JniMaceUtils

--- a/mace/examples/android/macelibrary/src/main/java/com/xiaomi/mace/JniMaceUtils.java
+++ b/mace/examples/android/macelibrary/src/main/java/com/xiaomi/mace/JniMaceUtils.java
@@ -20,9 +20,9 @@ public class JniMaceUtils {
        System.loadLibrary("mace_mobile_jni");
    }

-    public static native int maceMobilenetSetAttrs(int ompNumThreads, int cpuAffinityPolicy, int gpuPerfHint, int gpuPriorityHint, String kernelPath);
+    public static native int maceMobilenetCreateGPUContext(String storagePath);

-    public static native int maceMobilenetCreateEngine(String model, String device);
+    public static native int maceMobilenetCreateEngine(int ompNumThreads, int cpuAffinityPolicy, int gpuPerfHint, int gpuPriorityHint, String model, String device);

    public static native float[] maceMobilenetClassify(float[] input);


--- a/mace/examples/cli/example.cc
+++ b/mace/examples/cli/example.cc
@@ -21,7 +21,6 @@

 #include "gflags/gflags.h"
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 // if convert model to code.
 #ifdef MODEL_GRAPH_FORMAT_CODE
 #include "mace/codegen/engine/mace_engine_factory.h"
@@ -157,40 +156,40 @@ bool RunModel(const std::vector<std::string> &input_names,
              const std::vector<std::vector<int64_t>> &output_shapes) {
  // load model
  DeviceType device_type = ParseDeviceType(FLAGS_device);
-  // config runtime
-  mace::SetOpenMPThreadPolicy(
+  // configuration
+  // Detailed information please see mace.h
+  MaceStatus status;
+  MaceEngineConfig config(device_type);
+  status = config.SetCPUThreadPolicy(
      FLAGS_omp_num_threads,
      static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
+  if (status != MACE_SUCCESS) {
+    std::cerr << "Set openmp or cpu affinity failed." << std::endl;
+  }
 #ifdef MACE_ENABLE_OPENCL
+  std::shared_ptr<GPUContext> gpu_context;
  if (device_type == DeviceType::GPU) {
-    mace::SetGPUHints(
-        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
-        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
-
-    // Just call once. (Not thread-safe)
-    // Set paths of Generated OpenCL Compiled Kernel Binary file
-    // if you build gpu library of specific soc.
-    // Using OpenCL binary will speed up the initialization.
-    // OpenCL binary is corresponding to the OpenCL Driver version,
-    // you should update the binary when OpenCL Driver changed.
+    // DO NOT USE tmp directory.
+    // Please use APP's own directory and make sure the directory exists.
+    const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
+    const std::string storage_path =
+        std::string(storage_path_ptr == nullptr ?
+                    "/data/local/tmp/mace_run/interior" : storage_path_ptr);
    std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
-    mace::SetOpenCLBinaryPaths(opencl_binary_paths);

-    mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file);
+    gpu_context = GPUContextBuilder()
+        .SetStoragePath(storage_path)
+        .SetOpenCLBinaryPaths(opencl_binary_paths)
+        .SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
+        .Finalize();
+
+    config.SetGPUContext(gpu_context);
+    config.SetGPUHints(
+        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
+        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
  }
 #endif  // MACE_ENABLE_OPENCL

-  // DO NOT USE tmp directory.
-  // Please use APP's own directory and make sure the directory exists.
-  // Just call once
-  const std::string internal_storage_path =
-      "/data/local/tmp/mace_run/interior";
-
-  // Config internal kv storage factory.
-  std::shared_ptr<KVStorageFactory> storage_factory(
-      new FileStorageFactory(internal_storage_path));
-  SetKVStorageFactory(storage_factory);
-
  // Create Engine
  std::shared_ptr<mace::MaceEngine> engine;
  MaceStatus create_engine_status;
@@ -204,7 +203,7 @@ bool RunModel(const std::vector<std::string> &input_names,
                               FLAGS_model_data_file,
                               input_names,
                               output_names,
-                               device_type,
+                               config,
                               &engine);
 #else
  std::vector<unsigned char> model_pb_data;
@@ -216,7 +215,7 @@ bool RunModel(const std::vector<std::string> &input_names,
                                FLAGS_model_data_file,
                                input_names,
                                output_names,
-                                device_type,
+                                config,
                                &engine);
 #endif


--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -23,6 +23,7 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/core/types.h"
+#include "mace/kernels/kernel.h"

 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -126,10 +127,14 @@ template <DeviceType D, typename T>
 class ActivationFunctor;

 template <>
-class ActivationFunctor<DeviceType::CPU, float> {
+class ActivationFunctor<DeviceType::CPU, float> : OpKernel {
 public:
-  ActivationFunctor(ActivationType type, float relux_max_limit)
-      : activation_(type), relux_max_limit_(relux_max_limit) {}
+  ActivationFunctor(OpKernelContext *context,
+                    ActivationType type,
+                    float relux_max_limit)
+      : OpKernel(context),
+        activation_(type),
+        relux_max_limit_(relux_max_limit) {}

  MaceStatus operator()(const Tensor *input,
                        const Tensor *alpha,
@@ -159,10 +164,14 @@ class ActivationFunctor<DeviceType::CPU, float> {

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
-class ActivationFunctor<DeviceType::GPU, T> {
+class ActivationFunctor<DeviceType::GPU, T> : OpKernel {
 public:
-  ActivationFunctor(ActivationType type, T relux_max_limit)
-      : activation_(type), relux_max_limit_(static_cast<T>(relux_max_limit)) {}
+  ActivationFunctor(OpKernelContext *context,
+                    ActivationType type,
+                    T relux_max_limit)
+      : OpKernel(context),
+        activation_(type),
+        relux_max_limit_(static_cast<T>(relux_max_limit)) {}

  MaceStatus operator()(const Tensor *input,
                        const Tensor *alpha,

--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -24,6 +24,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"

 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -35,10 +36,11 @@ namespace kernels {
 constexpr int kCostPerGroup = 1024;

 template <DeviceType D, typename T>
-struct AddNFunctor {
+struct AddNFunctor : OpKernel {
+  explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {}
  MaceStatus operator()(const std::vector<const Tensor *> &input_tensors,
-                  Tensor *output_tensor,
-                  StatsFuture *future) {
+                        Tensor *output_tensor,
+                        StatsFuture *future) {
    MACE_UNUSED(future);
    MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensors[0]));
    index_t size = output_tensor->size();
@@ -95,7 +97,8 @@ struct AddNFunctor {

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
-struct AddNFunctor<DeviceType::GPU, T> {
+struct AddNFunctor<DeviceType::GPU, T> : OpKernel {
+  explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {}
  MaceStatus operator()(const std::vector<const Tensor *> &input_tensors,
                  Tensor *output_tensor,
                  StatsFuture *future);

--- a/mace/kernels/argmax.h
+++ b/mace/kernels/argmax.h
@@ -23,6 +23,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"

@@ -30,7 +31,8 @@ namespace mace {
 namespace kernels {

 template <DeviceType D, typename T>
-struct ArgMaxFunctor {
+struct ArgMaxFunctor : OpKernel {
+  explicit ArgMaxFunctor(OpKernelContext *context) : OpKernel(context) {}
  MaceStatus operator()(const Tensor *input,
                        const Tensor *axis,
                        Tensor *output,

--- a/mace/kernels/arm/conv_winograd_test.cc
+++ b/mace/kernels/arm/conv_winograd_test.cc
@@ -37,10 +37,10 @@ TEST(ConvWinogradTest, winograd) {
  index_t filter_size = 3 * 3 * in_channels * out_channels;
  index_t output_size = batch * out_channels * out_height * out_width;

-  Tensor input;
-  Tensor filter;
-  Tensor output;
-  Tensor output_ref;
+  Tensor input(GetCPUAllocator(), DataType::DT_FLOAT);
+  Tensor filter(GetCPUAllocator(), DataType::DT_FLOAT);
+  Tensor output(GetCPUAllocator(), DataType::DT_FLOAT);
+  Tensor output_ref(GetCPUAllocator(), DataType::DT_FLOAT);

  input.Resize({batch, in_channels, in_height, in_width});
  filter.Resize({out_channels, in_channels, 3, 3});

--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -33,11 +33,13 @@
 namespace mace {
 namespace kernels {

-struct BatchNormFunctorBase {
-  BatchNormFunctorBase(bool folded_constant,
+struct BatchNormFunctorBase : OpKernel {
+  BatchNormFunctorBase(OpKernelContext *context,
+                       bool folded_constant,
                       const ActivationType activation,
                       const float relux_max_limit)
-    : folded_constant_(folded_constant),
+    : OpKernel(context),
+      folded_constant_(folded_constant),
      activation_(activation),
      relux_max_limit_(relux_max_limit) {}

@@ -51,10 +53,14 @@ struct BatchNormFunctor;

 template<>
 struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
-  BatchNormFunctor(const bool folded_constant,
+  BatchNormFunctor(OpKernelContext *context,
+                   const bool folded_constant,
                   const ActivationType activation,
                   const float relux_max_limit)
-    : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
+      : BatchNormFunctorBase(context,
+                             folded_constant,
+                             activation,
+                             relux_max_limit) {}

  MaceStatus operator()(const Tensor *input,
                  const Tensor *scale,
@@ -132,10 +138,14 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
 struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
-  BatchNormFunctor(const bool folded_constant,
+  BatchNormFunctor(OpKernelContext *context,
+                   const bool folded_constant,
                   const ActivationType activation,
                   const float relux_max_limit)
-    : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
+      : BatchNormFunctorBase(context,
+                             folded_constant,
+                             activation,
+                             relux_max_limit) {}
  MaceStatus operator()(const Tensor *input,
                  const Tensor *scale,
                  const Tensor *offset,

--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -21,6 +21,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"

 #ifdef MACE_ENABLE_OPENCL
@@ -30,10 +31,10 @@
 namespace mace {
 namespace kernels {

-struct BiasAddFunctorBase {
-  explicit BiasAddFunctorBase(const DataFormat data_format) {
-    data_format_ = data_format;
-  }
+struct BiasAddFunctorBase : OpKernel {
+  BiasAddFunctorBase(OpKernelContext *context,
+                     const DataFormat data_format)
+      : OpKernel(context), data_format_(data_format) {}

  DataFormat data_format_;
 };
@@ -43,8 +44,9 @@ struct BiasAddFunctor;

 template <>
 struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase {
-  explicit BiasAddFunctor(const DataFormat data_format)
-      : BiasAddFunctorBase(data_format) {}
+  BiasAddFunctor(OpKernelContext *context,
+                 const DataFormat data_format)
+      : BiasAddFunctorBase(context, data_format) {}

  MaceStatus operator()(const Tensor *input,
                        const Tensor *bias,
@@ -96,8 +98,8 @@ struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase {
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
 struct BiasAddFunctor<DeviceType::GPU, T> : BiasAddFunctorBase {
-  explicit BiasAddFunctor(const DataFormat data_format)
-      : BiasAddFunctorBase(data_format) {}
+  BiasAddFunctor(OpKernelContext *context, const DataFormat data_format)
+      : BiasAddFunctorBase(context, data_format) {}
  MaceStatus operator()(const Tensor *input,
                        const Tensor *bias,
                        Tensor *output,

--- a/mace/kernels/buffer_to_image.h
+++ b/mace/kernels/buffer_to_image.h
@@ -20,21 +20,24 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"
 #include "mace/kernels/opencl/common.h"

 namespace mace {
 namespace kernels {

-struct BufferToImageFunctorBase {
-  explicit BufferToImageFunctorBase(const int wino_blk_size)
-    : wino_blk_size_(wino_blk_size) {}
+struct BufferToImageFunctorBase : OpKernel {
+  explicit BufferToImageFunctorBase(OpKernelContext *context,
+                                    const int wino_blk_size)
+    : OpKernel(context), wino_blk_size_(wino_blk_size) {}
  const int wino_blk_size_;
 };

 template <DeviceType D, typename T>
 struct BufferToImageFunctor : BufferToImageFunctorBase {
-  explicit BufferToImageFunctor(const int wino_blk_size)
-    : BufferToImageFunctorBase(wino_blk_size) {}
+  explicit BufferToImageFunctor(OpKernelContext *context,
+                                const int wino_blk_size)
+      : BufferToImageFunctorBase(context, wino_blk_size) {}
  MaceStatus operator()(const Tensor *input,
                  const BufferType type,
                  Tensor *output,
@@ -50,8 +53,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {

 template <typename T>
 struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
-  explicit BufferToImageFunctor(const int wino_blk_size)
-      : BufferToImageFunctorBase(wino_blk_size) {}
+  explicit BufferToImageFunctor(OpKernelContext *context,
+                                const int wino_blk_size)
+      : BufferToImageFunctorBase(context, wino_blk_size) {}
  MaceStatus operator()(const Tensor *input,
                  const BufferType type,
                  Tensor *output,

--- a/mace/kernels/channel_shuffle.h
+++ b/mace/kernels/channel_shuffle.h
@@ -20,13 +20,15 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"

 namespace mace {
 namespace kernels {

 template<DeviceType D, typename T>
-struct ChannelShuffleFunctor {
-  explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
+struct ChannelShuffleFunctor : OpKernel {
+  ChannelShuffleFunctor(OpKernelContext *context, const int groups)
+      : OpKernel(context), groups_(groups) {}

  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
@@ -70,8 +72,9 @@ struct ChannelShuffleFunctor {

 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
-struct ChannelShuffleFunctor<DeviceType::GPU, T> {
-  explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
+struct ChannelShuffleFunctor<DeviceType::GPU, T> : OpKernel {
+  ChannelShuffleFunctor(OpKernelContext *context, const int groups)
+      : OpKernel(context), groups_(groups) {}

  MaceStatus operator()(const Tensor *input,
                        Tensor *output,

--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -21,6 +21,7 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/core/types.h"
+#include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"

 #ifdef MACE_ENABLE_OPENCL
@@ -30,15 +31,17 @@
 namespace mace {
 namespace kernels {

-struct ConcatFunctorBase {
-  explicit ConcatFunctorBase(const int32_t axis) : axis_(axis) {}
+struct ConcatFunctorBase : OpKernel {
+  ConcatFunctorBase(OpKernelContext *context, const int32_t axis)
+      : OpKernel(context), axis_(axis) {}

  int32_t axis_;
 };

 template <DeviceType D, typename T>
 struct ConcatFunctor : ConcatFunctorBase {
-  explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
+  ConcatFunctor(OpKernelContext *context, const int32_t axis)
+      : ConcatFunctorBase(context, axis) {}

  MaceStatus operator()(const std::vector<const Tensor *> &input_list,
                  Tensor *output,
@@ -97,7 +100,8 @@ struct ConcatFunctor : ConcatFunctorBase {
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
 struct ConcatFunctor<DeviceType::GPU, T> : ConcatFunctorBase {
-  explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
+  ConcatFunctor(OpKernelContext *context, const int32_t axis)
+      : ConcatFunctorBase(context, axis) {}

  MaceStatus operator()(const std::vector<const Tensor *> &input_list,
                  Tensor *output,

--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -42,14 +42,16 @@
 namespace mace {
 namespace kernels {

-struct Conv2dFunctorBase {
-  Conv2dFunctorBase(const int *strides,
+struct Conv2dFunctorBase : OpKernel {
+  Conv2dFunctorBase(OpKernelContext *context,
+                    const int *strides,
                    const Padding &padding_type,
                    const std::vector<int> &paddings,
                    const int *dilations,
                    const ActivationType activation,
                    const float relux_max_limit)
-    : strides_(strides),
+    : OpKernel(context),
+      strides_(strides),
      padding_type_(padding_type),
      paddings_(paddings),
      dilations_(dilations),
@@ -69,7 +71,8 @@ struct Conv2dFunctor;

 template<>
 struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
-  Conv2dFunctor(const int *strides,
+  Conv2dFunctor(OpKernelContext *context,
+                const int *strides,
                const Padding &padding_type,
                const std::vector<int> &paddings,
                const int *dilations,
@@ -77,12 +80,14 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                const float relux_max_limit,
                const bool is_filter_transformed,
                ScratchBuffer *scratch)
-    : Conv2dFunctorBase(strides,
+    : Conv2dFunctorBase(context,
+                        strides,
                        padding_type,
                        paddings,
                        dilations,
                        activation,
                        relux_max_limit),
+      transformed_filter_(GetCPUAllocator(), DataType::DT_FLOAT),
      is_filter_transformed_(is_filter_transformed),
      scratch_(scratch) {}

@@ -721,7 +726,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {

 template<>
 struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
-  Conv2dFunctor(const int *strides,
+  Conv2dFunctor(OpKernelContext *context,
+                const int *strides,
                const Padding &padding_type,
                const std::vector<int> &paddings,
                const int *dilations,
@@ -729,7 +735,8 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
                const float relux_max_limit,
                const bool is_filter_transformed,
                ScratchBuffer *scratch)
-      : Conv2dFunctorBase(strides,
+      : Conv2dFunctorBase(context,
+                          strides,
                          padding_type,
                          paddings,
                          dilations,
@@ -949,7 +956,8 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
 struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
-  Conv2dFunctor(const int *strides,
+  Conv2dFunctor(OpKernelContext *context,
+                const int *strides,
                const Padding &padding_type,
                const std::vector<int> &paddings,
                const int *dilations,
@@ -957,7 +965,8 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
                const float relux_max_limit,
                const bool is_filter_transformed,
                ScratchBuffer *scratch)
-    : Conv2dFunctorBase(strides,
+    : Conv2dFunctorBase(context,
+                        strides,
                        padding_type,
                        paddings,
                        dilations,
@@ -968,10 +977,10 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
  }

  MaceStatus operator()(const Tensor *input,
-                  const Tensor *filter,
-                  const Tensor *bias,
-                  Tensor *output,
-                  StatsFuture *future);
+                        const Tensor *filter,
+                        const Tensor *bias,
+                        Tensor *output,
+                        StatsFuture *future);

  cl::Kernel kernel_;
  uint32_t kwg_size_;

--- a/mace/kernels/crop.h
+++ b/mace/kernels/crop.h
@@ -21,6 +21,7 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/core/types.h"
+#include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"

 #ifdef MACE_ENABLE_OPENCL
@@ -30,10 +31,12 @@
 namespace mace {
 namespace kernels {

-struct CropFunctorBase {
-  CropFunctorBase(const int axis,
+struct CropFunctorBase : OpKernel {
+  CropFunctorBase(OpKernelContext *context,
+                  const int axis,
                  const std::vector<int> &offset)
-      : axis_(axis),
+      : OpKernel(context),
+        axis_(axis),
        offset_(offset) {}

  const int axis_;
@@ -42,8 +45,10 @@ struct CropFunctorBase {

 template <DeviceType D, typename T>
 struct CropFunctor : CropFunctorBase {
-  CropFunctor(const int axis, const std::vector<int> &offset)
-      : CropFunctorBase(axis, offset) {}
+  CropFunctor(OpKernelContext *context,
+              const int axis,
+              const std::vector<int> &offset)
+      : CropFunctorBase(context, axis, offset) {}

  void crop_copy(const T* input_data, T* output_data,
                 const std::vector<index_t> &input_shape,
@@ -121,12 +126,14 @@ struct CropFunctor : CropFunctorBase {
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
 struct CropFunctor<DeviceType::GPU, T> : CropFunctorBase {
-  CropFunctor(const int axis, const std::vector<int> &offset)
-  : CropFunctorBase(axis, offset) {}
+  CropFunctor(OpKernelContext *context,
+              const int axis,
+              const std::vector<int> &offset)
+      : CropFunctorBase(context, axis, offset) {}

  MaceStatus operator()(const std::vector<const Tensor *> &input_list,
-                  Tensor *output,
-                  StatsFuture *future);
+                        Tensor *output,
+                        StatsFuture *future);
  cl::Kernel kernel_;
  uint32_t kwg_size_;
  std::unique_ptr<BufferBase> kernel_error_;

--- a/mace/kernels/deconv_2d.h
+++ b/mace/kernels/deconv_2d.h
@@ -89,14 +89,16 @@ void Deconv2dNCHW(const T *input,
 }
 }  // namespace deconv

-struct Deconv2dFunctorBase {
-  Deconv2dFunctorBase(const std::vector<int> &strides,
+struct Deconv2dFunctorBase : OpKernel {
+  Deconv2dFunctorBase(OpKernelContext *context,
+                      const std::vector<int> &strides,
                      const Padding &padding_type,
                      const std::vector<int> &paddings,
                      const std::vector<index_t> &output_shape,
                      const ActivationType activation,
                      const float relux_max_limit)
-      : strides_(strides),
+      : OpKernel(context),
+        strides_(strides),
        padding_type_(padding_type),
        paddings_(paddings),
        output_shape_(output_shape),
@@ -210,13 +212,15 @@ struct Deconv2dFunctorBase {

 template <DeviceType D, typename T>
 struct Deconv2dFunctor : Deconv2dFunctorBase {
-  Deconv2dFunctor(const std::vector<int> &strides,
+  Deconv2dFunctor(OpKernelContext *context,
+                  const std::vector<int> &strides,
                  const Padding &padding_type,
                  const std::vector<int> &paddings,
                  const std::vector<index_t> &output_shape,
                  const ActivationType activation,
                  const float relux_max_limit)
-      : Deconv2dFunctorBase(strides,
+      : Deconv2dFunctorBase(context,
+                            strides,
                            padding_type,
                            paddings,
                            output_shape,
@@ -315,13 +319,15 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
 struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
-  Deconv2dFunctor(const std::vector<int> &strides,
+  Deconv2dFunctor(OpKernelContext *context,
+                  const std::vector<int> &strides,
                  const Padding &padding_type,
                  const std::vector<int> &paddings,
                  const std::vector<index_t> &output_shape,
                  const ActivationType activation,
                  const float relux_max_limit)
-      : Deconv2dFunctorBase(strides,
+      : Deconv2dFunctorBase(context,
+                            strides,
                            padding_type,
                            paddings,
                            output_shape,

--- a/mace/kernels/depth_to_space.h
+++ b/mace/kernels/depth_to_space.h
@@ -19,6 +19,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"

 #ifdef MACE_ENABLE_OPENCL
@@ -29,9 +30,11 @@ namespace mace {
 namespace kernels {

 template<DeviceType D, typename T>
-struct DepthToSpaceOpFunctor {
-  explicit DepthToSpaceOpFunctor(const int block_size, bool d2s)
-      : block_size_(block_size), d2s_(d2s) {}
+struct DepthToSpaceOpFunctor : OpKernel {
+  DepthToSpaceOpFunctor(OpKernelContext *context,
+                        const int block_size,
+                        bool d2s)
+      : OpKernel(context), block_size_(block_size), d2s_(d2s) {}
  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
                        StatsFuture *future) {
@@ -123,9 +126,11 @@ struct DepthToSpaceOpFunctor {

 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
-struct DepthToSpaceOpFunctor<DeviceType::GPU, T> {
-  DepthToSpaceOpFunctor(const int block_size, bool d2s)
-      : block_size_(block_size), d2s_(d2s) {}
+struct DepthToSpaceOpFunctor<DeviceType::GPU, T> : OpKernel {
+  DepthToSpaceOpFunctor(OpKernelContext *context,
+                        const int block_size,
+                        bool d2s)
+      : OpKernel(context), block_size_(block_size), d2s_(d2s) {}
  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
                        StatsFuture *future);

--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -37,14 +37,16 @@
 namespace mace {
 namespace kernels {

-struct DepthwiseConv2dFunctorBase {
-  DepthwiseConv2dFunctorBase(const int *strides,
+struct DepthwiseConv2dFunctorBase : OpKernel {
+  DepthwiseConv2dFunctorBase(OpKernelContext *context,
+                             const int *strides,
                             const Padding padding_type,
                             const std::vector<int> &paddings,
                             const int *dilations,
                             const ActivationType activation,
                             const float relux_max_limit)
-    : strides_(strides),
+    : OpKernel(context),
+      strides_(strides),
      padding_type_(padding_type),
      paddings_(paddings),
      dilations_(dilations),
@@ -65,13 +67,15 @@ struct DepthwiseConv2dFunctor;
 template<>
 struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
  : public DepthwiseConv2dFunctorBase {
-  DepthwiseConv2dFunctor(const int *strides,
+  DepthwiseConv2dFunctor(OpKernelContext *context,
+                         const int *strides,
                         const Padding padding_type,
                         const std::vector<int> &paddings,
                         const int *dilations,
                         const ActivationType activation,
                         const float relux_max_limit)
-    : DepthwiseConv2dFunctorBase(strides,
+    : DepthwiseConv2dFunctorBase(context,
+                                 strides,
                                 padding_type,
                                 paddings,
                                 dilations,
@@ -288,13 +292,15 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
 template<>
 struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t>
    : public DepthwiseConv2dFunctorBase {
-  DepthwiseConv2dFunctor(const int *strides,
+  DepthwiseConv2dFunctor(OpKernelContext *context,
+                         const int *strides,
                         const Padding padding_type,
                         const std::vector<int> &paddings,
                         const int *dilations,
                         const ActivationType activation,
                         const float relux_max_limit)
-      : DepthwiseConv2dFunctorBase(strides,
+      : DepthwiseConv2dFunctorBase(context,
+                                   strides,
                                   padding_type,
                                   paddings,
                                   dilations,
@@ -451,7 +457,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t>
      const int32_t *bias_data = nullptr;
      if (bias == nullptr) {
        zero_bias.reset(
-            new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32));
+            new Tensor(GetCPUAllocator(), DT_INT32));
        zero_bias->Resize(bias_shape);
        zero_bias->Clear();
        bias_data = zero_bias->data<int32_t>();
@@ -495,13 +501,15 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t>
 template<typename T>
 struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
  : DepthwiseConv2dFunctorBase {
-  DepthwiseConv2dFunctor(const int *strides,
+  DepthwiseConv2dFunctor(OpKernelContext *context,
+                         const int *strides,
                         const Padding padding_type,
                         const std::vector<int> &paddings,
                         const int *dilations,
                         const ActivationType activation,
                         const float relux_max_limit)
-    : DepthwiseConv2dFunctorBase(strides,
+    : DepthwiseConv2dFunctorBase(context,
+                                 strides,
                                 padding_type,
                                 paddings,
                                 dilations,

--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -23,6 +23,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"

 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -802,13 +803,15 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
  }
 }

-struct EltwiseFunctorBase {
-  EltwiseFunctorBase(const EltwiseType type,
+struct EltwiseFunctorBase : OpKernel {
+  EltwiseFunctorBase(OpKernelContext *context,
+                     const EltwiseType type,
                     const std::vector<float> &coeff,
                     const float scalar_input,
                     const int32_t scalar_input_index,
                     const DataFormat data_format)
-      : type_(type),
+      : OpKernel(context),
+        type_(type),
        coeff_(coeff),
        scalar_input_(scalar_input),
        scalar_input_index_(scalar_input_index),
@@ -823,12 +826,14 @@ struct EltwiseFunctorBase {

 template <DeviceType D, typename T>
 struct EltwiseFunctor : EltwiseFunctorBase {
-  EltwiseFunctor(const EltwiseType type,
+  EltwiseFunctor(OpKernelContext *context,
+                 const EltwiseType type,
                 const std::vector<float> &coeff,
                 const float scalar_input,  // float as it comes from arg
                 const int32_t scalar_input_index,
                 const DataFormat data_format)
-      : EltwiseFunctorBase(type,
+      : EltwiseFunctorBase(context,
+                           type,
                           coeff,
                           scalar_input,
                           scalar_input_index,
@@ -956,12 +961,14 @@ struct EltwiseFunctor : EltwiseFunctorBase {
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
 struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
-  EltwiseFunctor(const EltwiseType type,
+  EltwiseFunctor(OpKernelContext *context,
+                 const EltwiseType type,
                 const std::vector<float> &coeff,
                 const float scalar_input,
                 const int32_t scalar_input_index,
                 const DataFormat data_format)
-      : EltwiseFunctorBase(type,
+      : EltwiseFunctorBase(context,
+                           type,
                           coeff,
                           scalar_input,
                           scalar_input_index,

--- a/mace/kernels/fill.h
+++ b/mace/kernels/fill.h
@@ -21,6 +21,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"

 namespace mace {
@@ -30,8 +31,8 @@ template <DeviceType D, class T>
 struct FillFunctor;

 template <>
-struct FillFunctor<DeviceType::CPU, float> {
-  FillFunctor() {}
+struct FillFunctor<DeviceType::CPU, float> : OpKernel {
+  explicit FillFunctor(OpKernelContext *context) : OpKernel(context) {}

  MaceStatus operator()(const Tensor *shape,
                        const Tensor *value,

--- a/mace/kernels/fully_connected.h
+++ b/mace/kernels/fully_connected.h
@@ -27,10 +27,12 @@
 namespace mace {
 namespace kernels {

-struct FullyConnectedBase {
-  FullyConnectedBase(const ActivationType activation,
+struct FullyConnectedBase : OpKernel {
+  FullyConnectedBase(OpKernelContext *context,
+                     const ActivationType activation,
                     const float relux_max_limit)
-      : activation_(activation),
+      : OpKernel(context),
+        activation_(activation),
        relux_max_limit_(relux_max_limit) {}

  const ActivationType activation_;
@@ -42,9 +44,10 @@ struct FullyConnectedFunctor;

 template <>
 struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
-  FullyConnectedFunctor(const ActivationType activation,
+  FullyConnectedFunctor(OpKernelContext *context,
+                        const ActivationType activation,
                        const float relux_max_limit)
-      : FullyConnectedBase(activation, relux_max_limit) {}
+      : FullyConnectedBase(context, activation, relux_max_limit) {}

  MaceStatus operator()(const Tensor *input,
                        const Tensor *weight,
@@ -86,9 +89,10 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {

 template <>
 struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
-  FullyConnectedFunctor(const ActivationType activation,
+  FullyConnectedFunctor(OpKernelContext *context,
+                        const ActivationType activation,
                        const float relux_max_limit)
-      : FullyConnectedBase(activation, relux_max_limit) {}
+      : FullyConnectedBase(context, activation, relux_max_limit) {}

  MaceStatus operator()(const Tensor *input,
                        const Tensor *weight,
@@ -117,7 +121,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
    const int32_t *bias_ptr = nullptr;
    if (bias == nullptr) {
      zero_bias.reset(
-          new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32));
+          new Tensor(GetCPUAllocator(), DT_INT32));
      zero_bias->Resize(bias_shape);
      zero_bias->Clear();
      bias_ptr = zero_bias->data<int32_t>();
@@ -148,9 +152,10 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
 struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
-  FullyConnectedFunctor(const ActivationType activation,
+  FullyConnectedFunctor(OpKernelContext *context,
+                        const ActivationType activation,
                        const float relux_max_limit)
-      : FullyConnectedBase(activation, relux_max_limit) {}
+      : FullyConnectedBase(context, activation, relux_max_limit) {}

  MaceStatus operator()(const Tensor *input,
                  const Tensor *weight,

--- a/mace/kernels/gather.h
+++ b/mace/kernels/gather.h
@@ -21,13 +21,15 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"

 namespace mace {
 namespace kernels {

-struct GatherBase {
-  explicit GatherBase(int axis, float y) : axis_(axis), y_(y) {}
+struct GatherBase : OpKernel {
+  GatherBase(OpKernelContext *context, int axis, float y)
+      : OpKernel(context), axis_(axis), y_(y) {}

  int axis_;
  float y_;
@@ -38,7 +40,8 @@ struct GatherFunctor;

 template <>
 struct GatherFunctor<DeviceType::CPU, float> : GatherBase {
-  explicit GatherFunctor(int axis, float y) : GatherBase(axis, y) {}
+  GatherFunctor(OpKernelContext *context, int axis, float y)
+      : GatherBase(context, axis, y) {}

  MaceStatus operator()(const Tensor *params,
                        const Tensor *indices,

--- a/mace/kernels/gemm.cc
+++ b/mace/kernels/gemm.cc
@@ -1341,8 +1341,8 @@ void Gemm(const float *A,
              ik_begin = bk * block_size_k + (bk < remain_k ? bk : remain_k);
          const index_t ik_end = std::min(K, ik_begin + this_block_size_k);

-          Tensor trans_a;
-          Tensor trans_b;
+          Tensor trans_a(GetCPUAllocator(), DataType::DT_FLOAT);
+          Tensor trans_b(GetCPUAllocator(), DataType::DT_FLOAT);
          const float *real_a = nullptr;
          const float *real_b = nullptr;
          float *real_c = c_base + (ih_begin * width + iw_begin);
@@ -1399,8 +1399,8 @@ void GemmRef(const float *A,
             const bool transpose_b) {
  memset(C, 0, sizeof(float) * batch * height * width);

-  Tensor trans_a;
-  Tensor trans_b;
+  Tensor trans_a(GetCPUAllocator(), DataType::DT_FLOAT);
+  Tensor trans_b(GetCPUAllocator(), DataType::DT_FLOAT);
  float *trans_a_data = nullptr;
  float *trans_b_data = nullptr;
  if (transpose_a) {

--- a/mace/kernels/image_to_buffer.h
+++ b/mace/kernels/image_to_buffer.h
@@ -20,21 +20,24 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"
 #include "mace/kernels/opencl/common.h"

 namespace mace {
 namespace kernels {

-struct ImageToBufferFunctorBase {
-  explicit ImageToBufferFunctorBase(const int wino_blk_size)
-    : wino_blk_size_(wino_blk_size) {}
+struct ImageToBufferFunctorBase : OpKernel {
+  ImageToBufferFunctorBase(OpKernelContext *context,
+                           const int wino_blk_size)
+    : OpKernel(context),
+      wino_blk_size_(wino_blk_size) {}
  const int wino_blk_size_;
 };

 template <DeviceType D, typename T>
 struct ImageToBufferFunctor : ImageToBufferFunctorBase {
-  explicit ImageToBufferFunctor(const int wino_blk_size)
-    : ImageToBufferFunctorBase(wino_blk_size) {}
+  ImageToBufferFunctor(OpKernelContext *context, const int wino_blk_size)
+    : ImageToBufferFunctorBase(context, wino_blk_size) {}
  MaceStatus operator()(const Tensor *input,
                        const BufferType type,
                        Tensor *output,
@@ -50,8 +53,9 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {

 template <typename T>
 struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase {
-  explicit ImageToBufferFunctor(const int wino_blk_size)
-    : ImageToBufferFunctorBase(wino_blk_size) {}
+  ImageToBufferFunctor(OpKernelContext *context,
+                                const int wino_blk_size)
+      : ImageToBufferFunctorBase(context, wino_blk_size) {}
  MaceStatus operator()(const Tensor *input,
                        const BufferType type,
                        Tensor *output,

--- a/mace/kernels/kernel.h
+++ b/mace/kernels/kernel.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_KERNELS_KERNEL_H_
+#define MACE_KERNELS_KERNEL_H_
+
+#include "mace/core/op_kernel_context.h"
+
+namespace mace {
+namespace kernels {
+
+struct OpKernel {
+  explicit OpKernel(OpKernelContext *context): context_(context) {}
+
+  OpKernelContext *context_;
+};
+
+}  // namespace kernels
+}  // namespace mace
+#endif  //  MACE_KERNELS_KERNEL_H_
--- a/mace/kernels/local_response_norm.h
+++ b/mace/kernels/local_response_norm.h
@@ -21,7 +21,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
-#include "mace/public/mace.h"
+#include "mace/kernels/kernel.h"

 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -34,7 +34,9 @@ template<DeviceType D, typename T>
 struct LocalResponseNormFunctor;

 template<>
-struct LocalResponseNormFunctor<DeviceType::CPU, float> {
+struct LocalResponseNormFunctor<DeviceType::CPU, float> : OpKernel {
+  explicit LocalResponseNormFunctor(OpKernelContext *context)
+      : OpKernel(context) {}
  MaceStatus operator()(const Tensor *input,
                  int depth_radius,
                  float bias,

--- a/mace/kernels/lstmcell.h
+++ b/mace/kernels/lstmcell.h
@@ -23,6 +23,7 @@
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"

 #if defined(MACE_ENABLE_NEON)
 #include <arm_neon.h>
@@ -35,9 +36,10 @@ template <DeviceType D, typename T>
 struct LSTMCellFunctor;

 template <typename T>
-struct LSTMCellFunctor<DeviceType::GPU, T> {
-  explicit LSTMCellFunctor(T forget_bias) :
-    forget_bias_(static_cast<T>(forget_bias)) {}
+struct LSTMCellFunctor<DeviceType::GPU, T> : OpKernel{
+  LSTMCellFunctor(OpKernelContext *context, T forget_bias)
+      : OpKernel(context),
+        forget_bias_(static_cast<T>(forget_bias)) {}
  MaceStatus operator()(const Tensor *input,
                        const Tensor *pre_output,
                        const Tensor *weight,

--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -29,6 +29,7 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/gemm.h"
+#include "mace/kernels/kernel.h"
 #include "mace/utils/utils.h"
 #include "mace/kernels/gemmlowp_util.h"

@@ -40,7 +41,8 @@ namespace mace {
 namespace kernels {

 template <DeviceType D, typename T>
-struct MatMulFunctor {
+struct MatMulFunctor : OpKernel {
+  explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
  MaceStatus operator()(const Tensor *A,
                        const Tensor *B,
                        Tensor *C,
@@ -87,7 +89,7 @@ struct MatMulFunctor {
      // A * B = (B^T * A^T)^T
      if (!transpose_b) {
        if (B_transpose_.get() == nullptr) {
-          B_transpose_.reset(new Tensor(GetDeviceAllocator(D),
+          B_transpose_.reset(new Tensor(context_->device()->allocator(),
                                        DataTypeToEnum<T>::v()));
          B_transpose_->Resize({batch, width, K});
          Tensor::MappingGuard guardbt(B_transpose_.get());
@@ -112,7 +114,8 @@ struct MatMulFunctor {
 };

 template <>
-struct MatMulFunctor<CPU, uint8_t> {
+struct MatMulFunctor<CPU, uint8_t> : OpKernel {
+  explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
  template<gemmlowp::MapOrder AOrder, gemmlowp::MapOrder BOrder>
  void MatMulImpl(const Tensor *A,
                  const Tensor *B,
@@ -208,7 +211,8 @@ struct MatMulFunctor<CPU, uint8_t> {

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
-struct MatMulFunctor<DeviceType::GPU, T> {
+struct MatMulFunctor<DeviceType::GPU, T> : OpKernel {
+  explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
  MaceStatus operator()(const Tensor *A,
                        const Tensor *B,
                        Tensor *C,

--- a/mace/kernels/opencl/activation.cc
+++ b/mace/kernels/opencl/activation.cc
@@ -33,11 +33,11 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(

  const index_t channel_blocks = RoundUpDiv4(channels);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
    built_options.emplace("-Dactivation=" + kernel_name);
@@ -94,12 +94,12 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
    input_shape_ = input->shape();
  }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
             output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws,
-                                           lws, future));
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
+                                           gws, lws, future));

  OUT_OF_RANGE_VALIDATION(kernel_error_);
  return MACE_SUCCESS;

--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -34,7 +34,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
  const index_t width = input_tensors[0]->dim(2);
  const index_t channels = input_tensors[0]->dim(3);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  for (size_t i = 1; i < size; ++i) {
    MACE_CHECK_NOTNULL(input_tensors[i]);
@@ -49,7 +49,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
      MACE_NOT_IMPLEMENTED;
    }
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
@@ -96,7 +96,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
  std::string tuning_key =
      Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
             output_tensor->dim(2), output_tensor->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, future));
  OUT_OF_RANGE_VALIDATION(kernel_error_);
  return MACE_SUCCESS;

--- a/mace/kernels/opencl/batch_norm.cc
+++ b/mace/kernels/opencl/batch_norm.cc
@@ -44,11 +44,11 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
@@ -101,11 +101,11 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
    input_shape_ = input->shape();
  }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3), folded_constant_);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, future));
  OUT_OF_RANGE_VALIDATION(kernel_error_);
  return MACE_SUCCESS;

--- a/mace/kernels/opencl/bias_add.cc
+++ b/mace/kernels/opencl/bias_add.cc
@@ -39,12 +39,12 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
    built_options.emplace("-Dbias_add=" + kernel_name);
@@ -65,7 +65,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
    input_shape_ = input->shape();
  }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);

  cl::Event event;
  cl_int error;

--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -75,12 +75,12 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
    }
  }

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;

--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -41,11 +41,11 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
    built_options.emplace("-Dchannel_shuffle=" + kernel_name);
@@ -72,11 +72,11 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
    input_shape_ = input->shape();
  }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, future));
  OUT_OF_RANGE_VALIDATION(kernel_error_);
  return MACE_SUCCESS;

--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -22,13 +22,15 @@ namespace mace {
 namespace kernels {

 namespace {
-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    lws[0] = lws[1] = lws[2] = 1;
  } else {
    uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
    lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
@@ -41,7 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {

 }  // namespace

-static MaceStatus Concat2(cl::Kernel *kernel,
+static MaceStatus Concat2(OpKernelContext *context,
+                          cl::Kernel *kernel,
                          const Tensor *input0,
                          const Tensor *input1,
                          const DataType dt,
@@ -61,11 +64,11 @@ static MaceStatus Concat2(cl::Kernel *kernel,
      static_cast<uint32_t>(batch * height),
  };

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

  if (kernel->get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
    built_options.emplace("-Dconcat_channel=" + kernel_name);
@@ -100,17 +103,18 @@ static MaceStatus Concat2(cl::Kernel *kernel,
    *prev_input_shape = input0->shape();
  }

-  const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
+  const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
  std::string tuning_key =
      Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                           gws, lws, future));
  OUT_OF_RANGE_VALIDATION(*kernel_error);
  return MACE_SUCCESS;
 }

-static MaceStatus ConcatN(cl::Kernel *kernel,
+static MaceStatus ConcatN(OpKernelContext *context,
+                          cl::Kernel *kernel,
                          const std::vector<const Tensor *> &input_list,
                          const DataType dt,
                          Tensor *output,
@@ -121,11 +125,11 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

  if (kernel->get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
    built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
@@ -148,7 +152,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
        static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
        static_cast<uint32_t>(batch * height),
    };
-    const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
+    const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);

    uint32_t idx = 0;
    OUT_OF_RANGE_SET_ARG_PTR;
@@ -168,8 +172,6 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
      for (size_t j = 0; j < 3; ++j) {
        roundup_gws[j] = RoundUp(gws[j], lws[j]);
      }
-      const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
-
      error = runtime->command_queue().enqueueNDRangeKernel(
          *kernel, cl::NullRange,
          cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
@@ -187,7 +189,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
    }
  }
  if (future != nullptr) {
-    future->wait_fn = [runtime, call_stats](CallStats *stats) {
+    future->wait_fn = [call_stats](CallStats *stats) {
      if (stats != nullptr) {
        stats->start_micros = call_stats.start_micros;
        stats->end_micros = stats->start_micros + call_stats.end_micros;
@@ -234,12 +236,14 @@ MaceStatus ConcatFunctor<DeviceType::GPU, T>::operator()(

  switch (inputs_count) {
    case 2:
-      return Concat2(&kernel_, input_list[0], input_list[1],
+      return Concat2(context_,
+                     &kernel_, input_list[0], input_list[1],
                     DataTypeToEnum<T>::value, &input_shape_, output, future,
                     &kwg_size_, &kernel_error_);
    default:
      if (divisible_four) {
-        return ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output,
+        return ConcatN(context_,
+                       &kernel_, input_list, DataTypeToEnum<T>::value, output,
                       future, &kwg_size_, &kernel_error_);
      } else {
        MACE_NOT_IMPLEMENTED;

--- a/mace/kernels/opencl/conv_2d.cc
+++ b/mace/kernels/opencl/conv_2d.cc
@@ -18,7 +18,8 @@
 namespace mace {
 namespace kernels {

-extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *runtime,
+                                   cl::Kernel *kernel,
                                   const Tensor *input,
                                   const Tensor *filter,
                                   const Tensor *bias,
@@ -34,7 +35,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
                                   uint32_t *kwg_size,
                                   std::unique_ptr<BufferBase> *kernel_error);

-extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *runtime,
+                                   cl::Kernel *kernel,
                                   const Tensor *input,
                                   const Tensor *filter,
                                   const Tensor *bias,
@@ -50,7 +52,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
                                   uint32_t *kwg_size,
                                   std::unique_ptr<BufferBase> *kernel_error);

-extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpencl(OpKernelContext *runtime,
+                               cl::Kernel *kernel,
                               const Tensor *input,
                               const Tensor *filter,
                               const Tensor *bias,
@@ -73,9 +76,10 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                         Tensor *output,
                                                         StatsFuture *future) {
  typedef MaceStatus (*Conv2dOpenclFunction)(
-      cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
-      const Tensor *bias, const int stride, const int *padding,
-      const int *dilations, const ActivationType activation,
+      OpKernelContext *runtime, cl::Kernel * kernel, const Tensor *input,
+      const Tensor *filter, const Tensor *bias, const int stride,
+      const int *padding, const int *dilations,
+      const ActivationType activation,
      const float relux_max_limit, const DataType dt,
      std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
      uint32_t *kwg_size, std::unique_ptr<BufferBase> *kernel_error);
@@ -116,12 +120,12 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
  if (kernel_h == kernel_w && kernel_h <= 3 &&
      selector[kernel_h - 1] != nullptr) {
    auto conv2d_func = selector[kernel_h - 1];
-    return conv2d_func(
+    return conv2d_func(context_,
        &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
        activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
        output, future, &kwg_size_, &kernel_error_);
  } else {
-    return Conv2dOpencl(
+    return Conv2dOpencl(context_,
        &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
        activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
        output, future, &kwg_size_, &kernel_error_);

--- a/mace/kernels/opencl/conv_2d_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_1x1.cc
@@ -25,14 +25,16 @@ namespace {
 const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
 // TODO(liuqi): Fix the specific value.
 const uint32_t lws_limit = 128;
-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    lws[0] = lws[1] = lws[2] = 1;
  } else {
    uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-    uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
+        cache_size = runtime->device_global_mem_cache_size();
+    uint32_t compute_units = runtime->device_compute_units();
    const uint32_t base =
        std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
@@ -62,7 +64,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {

 }  // namespace

-extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context,
+                                   cl::Kernel *kernel,
                                   const Tensor *input,
                                   const Tensor *filter,
                                   const Tensor *bias,
@@ -92,13 +95,13 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
  const index_t width_blocks = RoundUpDiv4(width);
  const index_t input_channel_blocks = RoundUpDiv4(input_channels);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

  if (kernel->get() == nullptr) {
    MACE_CHECK(input_batch == batch);

    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
    built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
@@ -160,11 +163,11 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
    *prev_input_shape = input->shape();
  }

-  std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
+  std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
  std::string tuning_key =
      Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                           gws, lws, future));
  OUT_OF_RANGE_VALIDATION(*kernel_error);
  return MACE_SUCCESS;

--- a/mace/kernels/opencl/conv_2d_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_3x3.cc
@@ -24,15 +24,17 @@ namespace kernels {
 namespace {
 // (inputs + weights + outputs) * array_size * sizeof(float)
 const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    lws[0] = lws[1] = lws[2] = 1;
  } else {
    uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
    uint32_t compute_units = std::max<uint32_t>(
-        OpenCLRuntime::Global()->device_compute_units() / 2, 1);
+        runtime->device_compute_units() / 2, 1);
    const uint32_t base =
        std::max<uint32_t>(
            std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1);
@@ -55,7 +57,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {

 }  // namespace

-extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context,
+                                   cl::Kernel *kernel,
                                   const Tensor *input,
                                   const Tensor *filter,
                                   const Tensor *bias,
@@ -80,11 +83,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
  const index_t input_channel_blocks = RoundUpDiv4(input_channels);
  const index_t width_blocks = RoundUpDiv<index_t, 5>(width);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

  if (kernel->get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
    built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
@@ -147,11 +150,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
    *prev_input_shape = input->shape();
  }

-  std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
+  std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
  std::string tuning_key =
      Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                           gws, lws, future));
  OUT_OF_RANGE_VALIDATION(*kernel_error);
  return MACE_SUCCESS;

--- a/mace/kernels/opencl/conv_2d_general.cc
+++ b/mace/kernels/opencl/conv_2d_general.cc
@@ -26,7 +26,8 @@ namespace {
 const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
 // TODO(liuqi): Fix the specific value.
 const uint32_t lws_limit = 20;
-std::vector<uint32_t> LocalWS(const uint32_t *gws,
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
                              const uint32_t kernel_size,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
@@ -34,8 +35,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
    lws[0] = lws[1] = lws[2] = 1;
  } else {
    uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-    uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
+        cache_size = runtime->device_global_mem_cache_size();
+    uint32_t compute_units = runtime->device_compute_units();
    const uint32_t base =
        std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
@@ -64,7 +65,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,

 }  // namespace

-extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpencl(OpKernelContext *context,
+                               cl::Kernel *kernel,
                               const Tensor *input,
                               const Tensor *filter,
                               const Tensor *bias,
@@ -89,11 +91,11 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
  const index_t input_channel_blocks = RoundUpDiv4(input_channels);
  const index_t width_blocks = RoundUpDiv4(width);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

  if (kernel->get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
    built_options.emplace("-Dconv_2d=" + kernel_name);
@@ -162,8 +164,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
      Concat("conv2d_general_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3), filter->dim(2), filter->dim(3));
  std::vector<uint32_t> lws =
-      LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+      LocalWS(runtime, gws, filter->dim(2) * filter->dim(3), *kwg_size);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                           gws, lws, future));

  OUT_OF_RANGE_VALIDATION(*kernel_error);

--- a/mace/kernels/opencl/crop.cc
+++ b/mace/kernels/opencl/crop.cc
@@ -22,13 +22,15 @@ namespace mace {
 namespace kernels {

 namespace {
-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    lws[0] = lws[1] = lws[2] = 1;
  } else {
    uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
    lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
@@ -132,11 +134,11 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
      static_cast<uint32_t>(output->dim(0) * output->dim(1))
  };

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
    built_options.emplace("-Dcrop=" + kernel_name);
@@ -167,11 +169,11 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
    input_shape_ = input0->shape();
  }

-  const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, future));
  OUT_OF_RANGE_VALIDATION(kernel_error_);
  return MACE_SUCCESS;

--- a/mace/kernels/opencl/deconv_2d.cc
+++ b/mace/kernels/opencl/deconv_2d.cc
@@ -20,7 +20,8 @@ namespace kernels {

 namespace {

-MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
+MaceStatus Deconv2dOpencl(OpKernelContext *context,
+                          cl::Kernel *kernel,
                          const Tensor *input,
                          const Tensor *filter,
                          const Tensor *bias,
@@ -58,11 +59,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
  const int align_w = stride_w - 1 - padding_w;
  const int kernel_size = filter->dim(2) * filter->dim(3);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

  if (kernel->get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
    built_options.emplace("-Ddeconv_2d=" + kernel_name);
@@ -133,11 +134,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
    *prev_input_shape = input->shape();
  }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, *kwg_size);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, *kwg_size);
  std::string tuning_key =
      Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                           gws, lws, future));

  OUT_OF_RANGE_VALIDATION(*kernel_error);
@@ -192,9 +193,10 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
                  &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));

-  return Deconv2dOpencl(&kernel_, input, filter, bias, strides_.data(),
-                        paddings.data(), activation_, relux_max_limit_,
-                        DataTypeToEnum<T>::value, &input_shape_, output, future,
+  return Deconv2dOpencl(context_, &kernel_, input, filter, bias,
+                        strides_.data(), paddings.data(), activation_,
+                        relux_max_limit_, DataTypeToEnum<T>::value,
+                        &input_shape_, output, future,
                        &kwg_size_, &kernel_error_);
 }


--- a/mace/kernels/opencl/depth_to_space.cc
+++ b/mace/kernels/opencl/depth_to_space.cc
@@ -72,11 +72,11 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::stringstream kernel_name_ss;
@@ -119,8 +119,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
    input_shape_ = input->shape();
  }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, future));

  OUT_OF_RANGE_VALIDATION(kernel_error_);

--- a/mace/kernels/opencl/depthwise_conv.cc
+++ b/mace/kernels/opencl/depthwise_conv.cc
@@ -24,13 +24,15 @@ namespace kernels {
 namespace {
 // (inputs + weights + outputs) * array_size * sizeof(float)
 const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    lws[0] = lws[1] = lws[2] = 1;
  } else {
    uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
    uint32_t base = cache_size / kBaseGPUMemCacheSize;
    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
    if (lws[1] >= base) {
@@ -58,7 +60,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {

 }  // namespace

-static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
+static MaceStatus DepthwiseConv2d(OpKernelContext *context,
+                                  cl::Kernel *kernel,
                                  const Tensor *input,   // NHWC
                                  const Tensor *filter,  // HWIM
                                  const Tensor *bias,
@@ -89,11 +92,11 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
                           static_cast<uint32_t>(width_blocks),
                           static_cast<uint32_t>(height * batch)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

  if (kernel->get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
    if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
@@ -170,10 +173,10 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
    *prev_input_shape = input->shape();
  }

-  const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
+  const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
  std::string tuning_key =
      Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                           gws, lws, future));

  OUT_OF_RANGE_VALIDATION(*kernel_error);
@@ -190,14 +193,10 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
  index_t kernel_h = filter->dim(2);
  index_t kernel_w = filter->dim(3);
  if (strides_[0] != strides_[1]) {
-    LOG(WARNING) << "OpenCL depthwise conv2d kernel with "
-                 << "filter" << kernel_h << "x" << kernel_w << ","
-                 << " stride " << strides_[0] << "x" << strides_[1]
-                 << " is not implemented yet, using slow version";
-    // TODO(heliangliang) The CPU/NEON kernel should map the buffer
-    return DepthwiseConv2dFunctor<DeviceType::CPU, float>(
-        strides_, padding_type_, paddings_, dilations_, activation_,
-        relux_max_limit_)(input, filter, bias, output, future);
+    LOG(FATAL) << "GPU depthwise conv2d kernel with "
+               << "filter" << kernel_h << "x" << kernel_w << ","
+               << " stride " << strides_[0] << "x" << strides_[1]
+               << " is not implemented yet.";
  }

  // Create a fake conv_2d filter to calculate the paddings and output size
@@ -226,6 +225,7 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));

  return DepthwiseConv2d(
+      context_,
      &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
      activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
      output, future, &kwg_size_, &kernel_error_);

--- a/mace/kernels/opencl/eltwise.cc
+++ b/mace/kernels/opencl/eltwise.cc
@@ -75,10 +75,10 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(batch_height_pixels)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
@@ -124,11 +124,11 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
    input_shape_ = input0->shape();
  }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, future));
  OUT_OF_RANGE_VALIDATION(kernel_error_);
  return MACE_SUCCESS;

--- a/mace/kernels/opencl/fully_connected.cc
+++ b/mace/kernels/opencl/fully_connected.cc
@@ -22,7 +22,8 @@ namespace kernels {

 namespace {
 template <typename T>
-MaceStatus FCWXKernel(cl::Kernel *kernel,
+MaceStatus FCWXKernel(OpKernelContext *context,
+                      cl::Kernel *kernel,
                      const Tensor *input,
                      const Tensor *weight,
                      const Tensor *bias,
@@ -36,7 +37,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
                      std::unique_ptr<BufferBase> *kernel_error) {
  MACE_CHECK_NOTNULL(gws);
  MACE_CHECK_NOTNULL(lws);
-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

  if (kernel->get() == nullptr) {
    const index_t batch = output->dim(0);
@@ -44,7 +45,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
    const index_t output_blocks = RoundUpDiv4(output_size);

    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
    NON_UNIFORM_WG_CONFIG;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
@@ -154,7 +155,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
 }

 template <typename T>
-MaceStatus FCWTXKernel(cl::Kernel *kernel,
+MaceStatus FCWTXKernel(OpKernelContext *context,
+                       cl::Kernel *kernel,
                       const Tensor *input,
                       const Tensor *weight,
                       const Tensor *bias,
@@ -168,10 +170,10 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
                       std::unique_ptr<BufferBase> *kernel_error) {
  MACE_CHECK_NOTNULL(gws);
  MACE_CHECK_NOTNULL(lws);
-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();
  if (kernel->get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
    NON_UNIFORM_WG_CONFIG;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected");
@@ -236,7 +238,7 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
  std::string tuning_key =
      Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2),
             output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key,
                                           gws->data(), *lws, future));

  OUT_OF_RANGE_VALIDATION(*kernel_error);
@@ -257,7 +259,8 @@ MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
                  &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));

-  return FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
+  return FCWXKernel<T>(context_,
+                       &kernel_, input, weight, bias, &input_shape_, output,
                       activation_, &gws_, &lws_, relux_max_limit_, future,
                       &kernel_error_);
 }

--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -226,14 +226,14 @@ std::string DtToUpCompatibleCLCMDDt(const DataType dt) {
  }
 }

-std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
+std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
+                                       const uint32_t *gws,
                                       const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    lws[0] = lws[1] = lws[2] = 1;
  } else {
-    uint64_t cache_size =
-        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint64_t cache_size = runtime->device_global_mem_cache_size();
    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
    lws[2] =
@@ -245,13 +245,12 @@ std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
  return lws;
 }

-MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
+MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime,
+                               const cl::Kernel &kernel,
                               const std::string tuning_key,
                               const uint32_t *gws,
                               const std::vector<uint32_t> &lws,
                               StatsFuture *future) {
-  auto runtime = OpenCLRuntime::Global();
-
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
    const uint32_t kwg_size =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
@@ -366,29 +365,28 @@ MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
    }
    return error;
  };
-  OpenCLProfilingTimer timer(&event);
-  cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
+  OpenCLProfilingTimer timer(runtime, &event);
+  cl_int err = runtime->tuner()->template TuneOrRun<cl_int>(
      tuning_key, lws, params_generator, func, &timer);
  MACE_CL_RET_STATUS(err);

  if (future != nullptr) {
-    future->wait_fn = [event](CallStats *stats) {
+    future->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
-        OpenCLRuntime::Global()->GetCallStats(event, stats);
+        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
 }

-MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
+MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime,
+                               const cl::Kernel &kernel,
                               const std::string tuning_key,
                               const uint32_t *gws,
                               const std::vector<uint32_t> &lws,
                               StatsFuture *future) {
-  auto runtime = OpenCLRuntime::Global();
-
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
    const uint32_t kwg_size =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
@@ -475,8 +473,8 @@ MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
    }
    return error;
  };
-  OpenCLProfilingTimer timer(&event);
-  cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
+  OpenCLProfilingTimer timer(runtime, &event);
+  cl_int err = runtime->tuner()->template TuneOrRun<cl_int>(
      tuning_key, lws, params_generator, func, &timer);
  MACE_CL_RET_STATUS(err);


--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -31,11 +31,11 @@
 namespace mace {
 namespace kernels {

-#define OUT_OF_RANGE_CONFIG(kernel_error)                   \
+#define OUT_OF_RANGE_CONFIG(kernel_error, context)          \
  if (runtime->IsOutOfRangeCheckEnabled()) {                \
    built_options.emplace("-DOUT_OF_RANGE_CHECK");          \
    (kernel_error) = std::move(std::unique_ptr<Buffer>(     \
-        new Buffer(GetDeviceAllocator(DeviceType::GPU))));  \
+        new Buffer((context)->device()->allocator())));     \
    MACE_RETURN_IF_ERROR((kernel_error)->Allocate(1));      \
    (kernel_error)->Map(nullptr);                           \
    *((kernel_error)->mutable_data<char>()) = 0;            \
@@ -115,14 +115,16 @@ std::string DtToCLDt(const DataType dt);
 std::string DtToUpCompatibleCLDt(const DataType dt);

 // Tuning or Run OpenCL kernel with 3D work group size
-MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
+MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime,
+                               const cl::Kernel &kernel,
                               const std::string tuning_key,
                               const uint32_t *gws,
                               const std::vector<uint32_t> &lws,
                               StatsFuture *future);

 // Tuning or Run OpenCL kernel with 2D work group size
-MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
+MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime,
+                               const cl::Kernel &kernel,
                               const std::string tuning_key,
                               const uint32_t *gws,
                               const std::vector<uint32_t> &lws,
@@ -162,7 +164,8 @@ std::string Concat(Args... args) {
  return ss.str();
 }

-std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
+std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
+                                       const uint32_t *gws,
                                       const uint32_t kwg_size);
 }  // namespace kernels
 }  // namespace mace

--- a/mace/kernels/opencl/image_to_buffer.cc
+++ b/mace/kernels/opencl/image_to_buffer.cc
@@ -67,12 +67,12 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
      break;
  }

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;

--- a/mace/kernels/opencl/lstmcell.cc
+++ b/mace/kernels/opencl/lstmcell.cc
@@ -38,11 +38,11 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()(
  const index_t width = input->dim(1);
  const index_t width_blocks = width / 4;

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell");
@@ -88,7 +88,7 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()(
  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
  std::string tuning_key =
      Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, future));
  OUT_OF_RANGE_VALIDATION(kernel_error_);


--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -53,11 +53,11 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
      static_cast<uint32_t>(height_blocks * batch),
  };

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
@@ -84,7 +84,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,

  const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
  std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, future));

  OUT_OF_RANGE_VALIDATION(kernel_error_);

--- a/mace/kernels/opencl/out_of_range_check_test.cc
+++ b/mace/kernels/opencl/out_of_range_check_test.cc
@@ -16,6 +16,8 @@
 #include <vector>

 #include "gtest/gtest.h"
+#include "mace/core/op_kernel_context.h"
+#include "mace/core/runtime/opencl/gpu_device.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
@@ -25,14 +27,15 @@ namespace mace {
 namespace kernels {
 namespace {

-bool BufferToImageOpImpl(Tensor *buffer,
+bool BufferToImageOpImpl(OpKernelContext *context,
+                         Tensor *buffer,
                         Tensor *image,
                         const std::vector<size_t> &image_shape) {
  std::unique_ptr<BufferBase> kernel_error;
  uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                     static_cast<uint32_t>(image_shape[1])};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

  std::string kernel_name = "in_out_buffer_to_image";
  std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
@@ -40,7 +43,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
  std::stringstream kernel_name_ss;
  kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
  built_options.emplace(kernel_name_ss.str());
-  OUT_OF_RANGE_CONFIG(kernel_error);
+  OUT_OF_RANGE_CONFIG(kernel_error, context);
  NON_UNIFORM_WG_CONFIG;
  if (buffer->dtype() == image->dtype()) {
    built_options.emplace("-DDATA_TYPE=" +
@@ -127,25 +130,33 @@ TEST(OutOfRangeCheckTest, RandomTest) {
  index_t width = 7;
  index_t channels = 11;

-  std::vector<index_t> buffer_shape = {batch, height, width, channels};
+  GPUContext gpu_context;
+  std::unique_ptr<Device> device(new GPUDevice(gpu_context.opencl_tuner()));
+
  Workspace ws;
+  OpKernelContext context(&ws, device.get());
+
+  std::vector<index_t> buffer_shape = {batch, height, width, channels};
  Tensor *buffer =
-      ws.CreateTensor("Buffer", GetDeviceAllocator(DeviceType::GPU),
+      ws.CreateTensor("Buffer", device->allocator(),
                      DataTypeToEnum<float>::v());
  buffer->Resize(buffer_shape);

  std::vector<size_t> image_shape;
-  Tensor *image = ws.CreateTensor("Image", GetDeviceAllocator(DeviceType::GPU),
+  Tensor *image = ws.CreateTensor("Image", device->allocator(),
                                  DataTypeToEnum<float>::v());
  CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape);
  image->ResizeImage(buffer->shape(), image_shape);
-  ASSERT_FALSE(BufferToImageOpImpl(buffer, image, image_shape));
+  ASSERT_FALSE(BufferToImageOpImpl(&context, buffer, image, image_shape));

  std::vector<size_t> overflow_image_shape = image_shape;
  for (size_t i = 0; i < overflow_image_shape.size(); ++i) {
    overflow_image_shape[i] += 1;
  }
-  ASSERT_TRUE(BufferToImageOpImpl(buffer, image, overflow_image_shape));
+  ASSERT_TRUE(BufferToImageOpImpl(&context,
+                                  buffer,
+                                  image,
+                                  overflow_image_shape));
 }

 }  // namespace kernels

--- a/mace/kernels/opencl/pad.cc
+++ b/mace/kernels/opencl/pad.cc
@@ -47,11 +47,11 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,

  const index_t channel_blocks = RoundUpDiv4(channels);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
    built_options.emplace("-Dpad=" + kernel_name);
@@ -85,10 +85,10 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
    input_shape_ = input->shape();
  }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
                                  output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, future));

  OUT_OF_RANGE_VALIDATION(kernel_error_);

--- a/mace/kernels/opencl/pooling.cc
+++ b/mace/kernels/opencl/pooling.cc
@@ -23,13 +23,15 @@ namespace kernels {

 namespace {

-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    lws[0] = lws[1] = lws[2] = 1;
  } else {
    uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
    lws[2] =
@@ -54,12 +56,12 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
  MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
      << "Pooling opencl kernel not support dilation yet";

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    const DataType dt = DataTypeToEnum<T>::value;
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
    built_options.emplace("-Dpooling=" + kernel_name);
@@ -149,11 +151,11 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
    };
  }

-  const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_);
+  const std::vector<uint32_t> lws = LocalWS(runtime, gws.data(), kwg_size_);
  std::string tuning_key =
      Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws.data(), lws, future));

  OUT_OF_RANGE_VALIDATION(kernel_error_);

--- a/mace/kernels/opencl/reduce_mean.cc
+++ b/mace/kernels/opencl/reduce_mean.cc
@@ -39,7 +39,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
  const index_t channel_blocks = RoundUpDiv4(channels);
  const uint32_t image_size = static_cast<uint32_t >(in_height * in_width);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();
  std::vector<uint32_t> gws(3);
  std::vector<uint32_t> lws(3);
  std::vector<index_t> output_shape{batch, 1, 1, channels};
@@ -50,7 +50,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
  if (kernel_.get() == nullptr) {
    const DataType dt = DataTypeToEnum<T>::value;
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean");
    built_options.emplace("-Dreduce_mean=" + kernel_name);

--- a/mace/kernels/opencl/resize_bicubic.cc
+++ b/mace/kernels/opencl/resize_bicubic.cc
@@ -23,9 +23,11 @@ namespace mace {
 namespace kernels {

 namespace {
-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  uint64_t cache_size = runtime->device_global_mem_cache_size();
  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  if (lws[1] >= base) {
@@ -65,15 +67,15 @@ MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()(
                           static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(out_height * batch)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
+    auto dt = DataTypeToEnum<T>::value;
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache");
    built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name);
-    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize));
@@ -115,11 +117,11 @@ MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()(
    input_shape_ = input->shape();
  }

-  const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
          Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1),
                 output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, future));

  OUT_OF_RANGE_VALIDATION(kernel_error_);

--- a/mace/kernels/opencl/resize_bilinear.cc
+++ b/mace/kernels/opencl/resize_bilinear.cc
@@ -23,13 +23,15 @@ namespace mace {
 namespace kernels {

 namespace {
-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    lws[0] = lws[1] = lws[2] = 1;
  } else {
    uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
    if (lws[1] >= base) {
@@ -70,11 +72,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
                           static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(out_height * batch)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
    built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
@@ -118,11 +120,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
    input_shape_ = input->shape();
  }

-  const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, future));

  OUT_OF_RANGE_VALIDATION(kernel_error_);

--- a/mace/kernels/opencl/softmax.cc
+++ b/mace/kernels/opencl/softmax.cc
@@ -24,13 +24,15 @@ namespace kernels {

 namespace {

-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    lws[0] = lws[1] = lws[2] = 1;
  } else {
    uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
    if (gws[0] < base) {
@@ -78,11 +80,11 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
    built_options.emplace("-Dsoftmax=" + kernel_name);
@@ -107,10 +109,10 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
    input_shape_ = logits->shape();
  }

-  std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
+  std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("softmax_opencl_kernel", batch, height, width, channels);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, future));

  OUT_OF_RANGE_VALIDATION(kernel_error_);

--- a/mace/kernels/opencl/space_to_batch.cc
+++ b/mace/kernels/opencl/space_to_batch.cc
@@ -54,12 +54,12 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
@@ -99,11 +99,11 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
    space_shape_ = space_tensor->shape();
  }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
             batch_tensor->dim(2), batch_tensor->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, future));

  OUT_OF_RANGE_VALIDATION(kernel_error_);

--- a/mace/kernels/opencl/split.cc
+++ b/mace/kernels/opencl/split.cc
@@ -40,11 +40,11 @@ MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()(
        output_list[i]->ResizeImage(output_shape, image_shape));
  }

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
    built_options.emplace("-Dsplit=" + kernel_name);
@@ -66,7 +66,7 @@ MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()(
      static_cast<uint32_t>(input->dim(0) * input->dim(1)),
  };

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  cl::Event event;
  CallStats call_stats{INT64_MAX, 0};
  for (size_t i = 0; i < outputs_count; ++i) {

--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -24,12 +24,12 @@ namespace kernels {
 template <typename T>
 MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
    const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name;
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    if (wino_blk_size_ == 4) {
      obfuscated_kernel_name =
@@ -120,7 +120,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
                                  output_tensor->dim(0),
                                  output_tensor->dim(1),
                                  output_tensor->dim(2));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, future));

  OUT_OF_RANGE_VALIDATION(kernel_error_);
@@ -132,7 +132,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
    const std::vector<const Tensor*> &inputs,
    Tensor *output_tensor,
    StatsFuture *future) {
-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

  const Tensor *input_tensor = inputs[0];
  const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr;
@@ -140,7 +140,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name;
    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
    NON_UNIFORM_WG_CONFIG;
    if (wino_blk_size_ == 4) {
      obfuscated_kernel_name =
@@ -241,7 +241,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
      Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
             output_tensor->dim(1), output_tensor->dim(2),
             output_tensor->dim(3), input_tensor->dim(2));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, future));

  OUT_OF_RANGE_VALIDATION(kernel_error_);

--- a/mace/kernels/pad.h
+++ b/mace/kernels/pad.h
@@ -21,6 +21,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"

 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -29,10 +30,13 @@
 namespace mace {
 namespace kernels {

-struct PadFunctorBase {
-  PadFunctorBase(const std::vector<int> &paddings,
+struct PadFunctorBase : OpKernel {
+  PadFunctorBase(OpKernelContext *context,
+                 const std::vector<int> &paddings,
                 const float constant_value)
-      : paddings_(paddings), constant_value_(constant_value) {}
+      : OpKernel(context),
+        paddings_(paddings),
+        constant_value_(constant_value) {}

  std::vector<int> paddings_;
  float constant_value_;
@@ -40,9 +44,10 @@ struct PadFunctorBase {

 template<DeviceType D, typename T>
 struct PadFunctor : public PadFunctorBase {
-  PadFunctor(const std::vector<int> &paddings,
+  PadFunctor(OpKernelContext *context,
+             const std::vector<int> &paddings,
             const float constant_value)
-      : PadFunctorBase(paddings, constant_value) {}
+      : PadFunctorBase(context, paddings, constant_value) {}

  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
@@ -93,9 +98,10 @@ struct PadFunctor : public PadFunctorBase {
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
 struct PadFunctor<DeviceType::GPU, T> : PadFunctorBase {
-  PadFunctor(const std::vector<int> &paddings,
+  PadFunctor(OpKernelContext *context,
+             const std::vector<int> &paddings,
             const float constant_value)
-      : PadFunctorBase(paddings, constant_value) {}
+      : PadFunctorBase(context, paddings, constant_value) {}

  MaceStatus operator()(const Tensor *input,
                  Tensor *output,

--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -23,6 +23,7 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/kernels/kernel.h"

 #if defined(MACE_ENABLE_NEON)
 #include <arm_neon.h>
@@ -41,14 +42,16 @@ enum PoolingType {

 namespace kernels {

-struct PoolingFunctorBase {
-  PoolingFunctorBase(const PoolingType pooling_type,
+struct PoolingFunctorBase : OpKernel {
+  PoolingFunctorBase(OpKernelContext *context,
+                     const PoolingType pooling_type,
                     const int *kernels,
                     const int *strides,
                     const Padding padding_type,
                     const std::vector<int> &paddings,
                     const int *dilations)
-      : pooling_type_(pooling_type),
+      : OpKernel(context),
+        pooling_type_(pooling_type),
        kernels_(kernels),
        strides_(strides),
        padding_type_(padding_type),
@@ -68,14 +71,20 @@ struct PoolingFunctor;

 template <>
 struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
-  PoolingFunctor(const PoolingType pooling_type,
+  PoolingFunctor(OpKernelContext *context,
+                 const PoolingType pooling_type,
                 const int *kernels,
                 const int *strides,
                 const Padding padding_type,
                 const std::vector<int> &paddings,
                 const int *dilations)
-      : PoolingFunctorBase(
-            pooling_type, kernels, strides, padding_type, paddings, dilations) {
+      : PoolingFunctorBase(context,
+                           pooling_type,
+                           kernels,
+                           strides,
+                           padding_type,
+                           paddings,
+                           dilations) {
  }

  void MaxPooling(const float *input,
@@ -231,15 +240,20 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {

 template <>
 struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase {
-  PoolingFunctor(const PoolingType pooling_type,
+  PoolingFunctor(OpKernelContext *context,
+                 const PoolingType pooling_type,
                 const int *kernels,
                 const int *strides,
                 const Padding padding_type,
                 const std::vector<int> &paddings,
                 const int *dilations)
-      : PoolingFunctorBase(
-      pooling_type, kernels, strides, padding_type, paddings, dilations) {
-  }
+      : PoolingFunctorBase(context,
+                           pooling_type,
+                           kernels,
+                           strides,
+                           padding_type,
+                           paddings,
+                           dilations) {}

  void MaxPooling(const uint8_t *input,
                  const index_t *in_shape,
@@ -443,14 +457,20 @@ struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase {
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
 struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase {
-  PoolingFunctor(const PoolingType pooling_type,
+  PoolingFunctor(OpKernelContext *context,
+                 const PoolingType pooling_type,
                 const int *kernels,
                 const int *strides,
                 const Padding padding_type,
                 const std::vector<int> &paddings,
                 const int *dilations)
-      : PoolingFunctorBase(
-            pooling_type, kernels, strides, padding_type, paddings, dilations) {
+      : PoolingFunctorBase(context,
+                           pooling_type,
+                           kernels,
+                           strides,
+                           padding_type,
+                           paddings,
+                           dilations) {
  }
  MaceStatus operator()(const Tensor *input_tensor,
                  Tensor *output_tensor,

--- a/mace/kernels/proposal.h
+++ b/mace/kernels/proposal.h
@@ -21,6 +21,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"

 namespace mace {
@@ -121,8 +122,9 @@ inline std::vector<int> nms(const float *bboxes_ptr,


 template<DeviceType D, typename T>
-struct ProposalFunctor {
-  ProposalFunctor(const int min_size,
+struct ProposalFunctor : OpKernel {
+  ProposalFunctor(OpKernelContext *context,
+                  const int min_size,
                  const float nms_thresh,
                  const int pre_nms_top_n,
                  const int post_nms_top_n,
@@ -130,6 +132,7 @@ struct ProposalFunctor {
                  const int base_size,
                  const std::vector<int> &scales,
                  const std::vector<float> &ratios) :
+      OpKernel(context),
      min_size_(min_size),
      thresh_(nms_thresh),
      pre_nms_top_n_(pre_nms_top_n),

--- a/mace/kernels/quantize.h
+++ b/mace/kernels/quantize.h
@@ -21,6 +21,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"

 namespace mace {
 namespace kernels {
@@ -173,8 +174,8 @@ template<DeviceType D, typename T>
 struct QuantizeFunctor;

 template<>
-struct QuantizeFunctor<CPU, uint8_t> {
-  QuantizeFunctor() {}
+struct QuantizeFunctor<CPU, uint8_t> : OpKernel {
+  explicit QuantizeFunctor(OpKernelContext *context) : OpKernel(context) {}

  MaceStatus operator()(const Tensor *input,
                        const bool non_zero,
@@ -212,8 +213,8 @@ template<DeviceType D, typename T>
 struct DequantizeFunctor;

 template<>
-struct DequantizeFunctor<CPU, uint8_t> {
-  DequantizeFunctor() {}
+struct DequantizeFunctor<CPU, uint8_t> : OpKernel {
+  explicit DequantizeFunctor(OpKernelContext *context) : OpKernel(context) {}

  MaceStatus operator()(const Tensor *input,
                        Tensor *output,

--- a/mace/kernels/reduce_mean.h
+++ b/mace/kernels/reduce_mean.h
@@ -24,6 +24,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/cl2_header.h"
 #endif
@@ -31,10 +32,12 @@
 namespace mace {
 namespace kernels {

-struct ReduceFunctorBase {
-  ReduceFunctorBase(const std::vector<int> &axis,
+struct ReduceFunctorBase : OpKernel {
+  ReduceFunctorBase(OpKernelContext *context,
+                    const std::vector<int> &axis,
                    const bool keep_dims)
-      : keep_dims_(keep_dims),
+      : OpKernel(context),
+        keep_dims_(keep_dims),
        axis_(axis) {}
  bool keep_dims_;
  bool reduce_first_axis_;
@@ -44,10 +47,11 @@ struct ReduceFunctorBase {
 };

 template <DeviceType D, typename T>
-struct ReduceMeanFunctor : ReduceFunctorBase{
-  ReduceMeanFunctor(const std::vector<int> &axis,
+struct ReduceMeanFunctor : ReduceFunctorBase {
+  ReduceMeanFunctor(OpKernelContext *context,
+                    const std::vector<int> &axis,
                    const bool keep_dims)
-      : ReduceFunctorBase(axis, keep_dims) {}
+      : ReduceFunctorBase(context, axis, keep_dims) {}

  void Simplify(const Tensor *input) {
    std::vector<bool> bitmap(static_cast<uint32_t>(input->dim_size()), false);
@@ -220,9 +224,10 @@ struct ReduceMeanFunctor : ReduceFunctorBase{
 template <typename T>
 struct ReduceMeanFunctor<DeviceType::GPU, T>
    : ReduceFunctorBase {
-  ReduceMeanFunctor(const std::vector<int> axis,
+  ReduceMeanFunctor(OpKernelContext *context,
+                    const std::vector<int> axis,
                    const bool keep_dims)
-      : ReduceFunctorBase(axis, keep_dims) {}
+      : ReduceFunctorBase(context, axis, keep_dims) {}

  MaceStatus operator()(const Tensor *input,
                        Tensor *output_tensor,

--- a/mace/kernels/reshape.h
+++ b/mace/kernels/reshape.h
@@ -19,17 +19,14 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
-
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
+#include "mace/kernels/kernel.h"

 namespace mace {
 namespace kernels {

 template <DeviceType D, typename T>
-struct ReshapeFunctor {
-  ReshapeFunctor() {}
+struct ReshapeFunctor : OpKernel {
+  explicit ReshapeFunctor(OpKernelContext *context) : OpKernel(context) {}

  MaceStatus operator()(const Tensor *input,
                  const std::vector<index_t> &out_shape,

--- a/mace/kernels/resize_bicubic.h
+++ b/mace/kernels/resize_bicubic.h
@@ -21,6 +21,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"
 #include "mace/utils/logging.h"

 #ifdef MACE_ENABLE_OPENCL
@@ -136,10 +137,11 @@ inline void ResizeImage(const float *images,
  }
 }

-struct ResizeBicubicFunctorBase {
-  ResizeBicubicFunctorBase(const std::vector<index_t> &size,
-                            bool align_corners)
-      : align_corners_(align_corners) {
+struct ResizeBicubicFunctorBase : OpKernel {
+  ResizeBicubicFunctorBase(OpKernelContext *context,
+                           const std::vector<index_t> &size,
+                           bool align_corners)
+      : OpKernel(context), align_corners_(align_corners) {
    MACE_CHECK(size.size() == 2);
    out_height_ = size[0];
    out_width_ = size[1];
@@ -157,8 +159,10 @@ struct ResizeBicubicFunctor;
 template<>
 struct ResizeBicubicFunctor<DeviceType::CPU, float>
    : ResizeBicubicFunctorBase {
-  ResizeBicubicFunctor(const std::vector<index_t> &size, bool align_corners)
-      : ResizeBicubicFunctorBase(size, align_corners) {}
+  ResizeBicubicFunctor(OpKernelContext *context,
+                       const std::vector<index_t> &size,
+                       bool align_corners)
+      : ResizeBicubicFunctorBase(context, size, align_corners) {}

  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
@@ -203,8 +207,10 @@ struct ResizeBicubicFunctor<DeviceType::CPU, float>
 template<typename T>
 struct ResizeBicubicFunctor<DeviceType::GPU, T>
    : ResizeBicubicFunctorBase {
-  ResizeBicubicFunctor(const std::vector<index_t> &size, bool align_corners)
-      : ResizeBicubicFunctorBase(size, align_corners) {}
+  ResizeBicubicFunctor(OpKernelContext *context,
+                       const std::vector<index_t> &size,
+                       bool align_corners)
+      : ResizeBicubicFunctorBase(context, size, align_corners) {}

  MaceStatus operator()(const Tensor *input,
                        Tensor *output,

--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
--- a/mace/kernels/scalar_math.h
+++ b/mace/kernels/scalar_math.h
--- a/mace/kernels/sgemm.h
+++ b/mace/kernels/sgemm.h
--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
--- a/mace/kernels/split.h
+++ b/mace/kernels/split.h
--- a/mace/kernels/stack.h
+++ b/mace/kernels/stack.h
--- a/mace/kernels/strided_slice.h
+++ b/mace/kernels/strided_slice.h
--- a/mace/kernels/transpose.h
+++ b/mace/kernels/transpose.h
--- a/mace/kernels/unstack.h
+++ b/mace/kernels/unstack.h
--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
--- a/mace/libmace/mace_runtime.cc
+++ b/mace/libmace/mace_runtime.cc
--- a/mace/libmace/mace_version_script.lds
+++ b/mace/libmace/mace_version_script.lds
--- a/mace/ops/BUILD
+++ b/mace/ops/BUILD
--- a/mace/ops/activation.h
+++ b/mace/ops/activation.h
--- a/mace/ops/activation_test.cc
+++ b/mace/ops/activation_test.cc
--- a/mace/ops/addn.h
+++ b/mace/ops/addn.h
--- a/mace/ops/addn_test.cc
+++ b/mace/ops/addn_test.cc
--- a/mace/ops/argmax.h
+++ b/mace/ops/argmax.h
--- a/mace/ops/argmax_test.cc
+++ b/mace/ops/argmax_test.cc
--- a/mace/ops/batch_norm.h
+++ b/mace/ops/batch_norm.h
--- a/mace/ops/batch_norm_test.cc
+++ b/mace/ops/batch_norm_test.cc
--- a/mace/ops/batch_to_space.h
+++ b/mace/ops/batch_to_space.h
--- a/mace/ops/bias_add.h
+++ b/mace/ops/bias_add.h
--- a/mace/ops/bias_add_test.cc
+++ b/mace/ops/bias_add_test.cc
--- a/mace/ops/buffer_to_image.h
+++ b/mace/ops/buffer_to_image.h
--- a/mace/ops/cast.h
+++ b/mace/ops/cast.h
--- a/mace/ops/channel_shuffle.h
+++ b/mace/ops/channel_shuffle.h
--- a/mace/ops/channel_shuffle_test.cc
+++ b/mace/ops/channel_shuffle_test.cc
--- a/mace/ops/concat.h
+++ b/mace/ops/concat.h
--- a/mace/ops/conv_2d.h
+++ b/mace/ops/conv_2d.h
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
--- a/mace/ops/conv_pool_2d_base.h
+++ b/mace/ops/conv_pool_2d_base.h
--- a/mace/ops/core_test.cc
+++ b/mace/ops/core_test.cc
--- a/mace/ops/crop.h
+++ b/mace/ops/crop.h
--- a/mace/ops/crop_test.cc
+++ b/mace/ops/crop_test.cc
--- a/mace/ops/deconv_2d.h
+++ b/mace/ops/deconv_2d.h
--- a/mace/ops/deconv_2d_test.cc
+++ b/mace/ops/deconv_2d_test.cc
--- a/mace/ops/depth_to_space.h
+++ b/mace/ops/depth_to_space.h
--- a/mace/ops/depth_to_space_test.cc
+++ b/mace/ops/depth_to_space_test.cc
--- a/mace/ops/depthwise_conv2d.h
+++ b/mace/ops/depthwise_conv2d.h
--- a/mace/ops/depthwise_conv2d_test.cc
+++ b/mace/ops/depthwise_conv2d_test.cc
--- a/mace/ops/eltwise.h
+++ b/mace/ops/eltwise.h
--- a/mace/ops/eltwise_test.cc
+++ b/mace/ops/eltwise_test.cc
--- a/mace/ops/fill.h
+++ b/mace/ops/fill.h
--- a/mace/ops/folded_batch_norm.h
+++ b/mace/ops/folded_batch_norm.h
--- a/mace/ops/folded_batch_norm_test.cc
+++ b/mace/ops/folded_batch_norm_test.cc
--- a/mace/ops/fully_connected.h
+++ b/mace/ops/fully_connected.h
--- a/mace/ops/fully_connected_test.cc
+++ b/mace/ops/fully_connected_test.cc
--- a/mace/ops/gather.h
+++ b/mace/ops/gather.h
--- a/mace/ops/gather_test.cc
+++ b/mace/ops/gather_test.cc
--- a/mace/ops/identity.h
+++ b/mace/ops/identity.h
--- a/mace/ops/image_to_buffer.h
+++ b/mace/ops/image_to_buffer.h
--- a/mace/ops/infer_conv2d_shape.h
+++ b/mace/ops/infer_conv2d_shape.h
--- a/mace/ops/local_response_norm.h
+++ b/mace/ops/local_response_norm.h
--- a/mace/ops/local_response_norm_test.cc
+++ b/mace/ops/local_response_norm_test.cc
--- a/mace/ops/lstmcell.h
+++ b/mace/ops/lstmcell.h
--- a/mace/ops/matmul.h
+++ b/mace/ops/matmul.h
--- a/mace/ops/matmul_test.cc
+++ b/mace/ops/matmul_test.cc
--- a/mace/ops/ops_test_util.cc
+++ b/mace/ops/ops_test_util.cc
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
--- a/mace/ops/pad.h
+++ b/mace/ops/pad.h
--- a/mace/ops/pad_test.cc
+++ b/mace/ops/pad_test.cc
--- a/mace/ops/pooling.h
+++ b/mace/ops/pooling.h
--- a/mace/ops/pooling_test.cc
+++ b/mace/ops/pooling_test.cc
--- a/mace/ops/proposal.h
+++ b/mace/ops/proposal.h
--- a/mace/ops/proposal_test.cc
+++ b/mace/ops/proposal_test.cc
--- a/mace/ops/quantize.h
+++ b/mace/ops/quantize.h
--- a/mace/ops/reduce_mean.h
+++ b/mace/ops/reduce_mean.h
--- a/mace/ops/reduce_mean_test.cc
+++ b/mace/ops/reduce_mean_test.cc
--- a/mace/ops/reshape.h
+++ b/mace/ops/reshape.h
--- a/mace/ops/resize_bicubic.h
+++ b/mace/ops/resize_bicubic.h
--- a/mace/ops/resize_bicubic_test.cc
+++ b/mace/ops/resize_bicubic_test.cc
--- a/mace/ops/resize_bilinear.h
+++ b/mace/ops/resize_bilinear.h
--- a/mace/ops/resize_bilinear_test.cc
+++ b/mace/ops/resize_bilinear_test.cc
--- a/mace/ops/scalar_math.h
+++ b/mace/ops/scalar_math.h
--- a/mace/ops/scalar_math_test.cc
+++ b/mace/ops/scalar_math_test.cc
--- a/mace/ops/shape.h
+++ b/mace/ops/shape.h
--- a/mace/ops/softmax.h
+++ b/mace/ops/softmax.h
--- a/mace/ops/softmax_test.cc
+++ b/mace/ops/softmax_test.cc
--- a/mace/ops/space_to_batch.h
+++ b/mace/ops/space_to_batch.h
--- a/mace/ops/space_to_batch_test.cc
+++ b/mace/ops/space_to_batch_test.cc
--- a/mace/ops/space_to_depth.h
+++ b/mace/ops/space_to_depth.h
--- a/mace/ops/split.h
+++ b/mace/ops/split.h
--- a/mace/ops/squeeze.h
+++ b/mace/ops/squeeze.h
--- a/mace/ops/stack.h
+++ b/mace/ops/stack.h
--- a/mace/ops/strided_slice.h
+++ b/mace/ops/strided_slice.h
--- a/mace/ops/transpose.h
+++ b/mace/ops/transpose.h
--- a/mace/ops/unstack.h
+++ b/mace/ops/unstack.h
--- a/mace/ops/winograd_convolution_test.cc
+++ b/mace/ops/winograd_convolution_test.cc
--- a/mace/ops/winograd_inverse_transform.h
+++ b/mace/ops/winograd_inverse_transform.h
--- a/mace/ops/winograd_transform.h
+++ b/mace/ops/winograd_transform.h
--- a/mace/public/BUILD
+++ b/mace/public/BUILD
--- a/mace/public/mace.h
+++ b/mace/public/mace.h
--- a/mace/public/mace_runtime.h
+++ b/mace/public/mace_runtime.h
--- a/mace/python/tools/mace_engine_factory.h.jinja2
+++ b/mace/python/tools/mace_engine_factory.h.jinja2
--- a/mace/test/BUILD
+++ b/mace/test/BUILD
--- a/mace/test/mace_api_exception_test.cc
+++ b/mace/test/mace_api_exception_test.cc
--- a/mace/test/mace_api_mt_test.cc
+++ b/mace/test/mace_api_mt_test.cc
--- a/mace/test/mace_api_test.cc
+++ b/mace/test/mace_api_test.cc
--- a/mace/tools/quantization/quantize_stat.cc
+++ b/mace/tools/quantization/quantize_stat.cc
--- a/mace/tools/validation/mace_run.cc
+++ b/mace/tools/validation/mace_run.cc
--- a/mace/utils/tuner.h
+++ b/mace/utils/tuner.h
--- a/mace/utils/tuner_test.cc
+++ b/mace/utils/tuner_test.cc