Commit e4dc6f9e, authored by 李滨

Merge branch 'refactor-mace' into 'master'

Adjusted the structure of MACE to support mixed CPU and GPU execution.

See merge request !846
......@@ -5,107 +5,114 @@ You can create a custom op if it is not supported yet.
To add a custom op, you need to follow these steps:
Register the new OpDef information
----------------------------------
Register the OpDef information that declares which devices the operation can run on.
The registry file is `mace/ops/ops_def_register.cc`. The device list is returned in priority order: if the target device is not in the list, the runtime falls back to its first entry.
```c++
#include "mace/ops/ops_def_register.h"

namespace mace {
namespace ops {

void RegisterOpDefs(OpDefRegistryBase *op_def_registry) {
  MACE_REGISTER_OP_DEF(
      op_def_registry,
      OpRegistrationBuilder("MyCustomOp")
          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
            return {DeviceType::CPU, DeviceType::GPU};
          }));
  ......
}

}  // namespace ops
}  // namespace mace
```
Implement the Operation
-----------------------
The best way is to refer to the implementation of another operator (e.g. `mace/kernels/activation.cc`).

Define the new Op class in `mace/kernels/my_custom_op.cc`.
1. CPU code: just write the code in `mace/kernels/my_custom_op.cc`.
2. GPU code: the kernel API is defined in `mace/kernels/my_custom_op.h`;
the Image-based kernel is implemented in `mace/kernels/opencl/image/my_custom_op.cc`,
and the Buffer-based kernel in `mace/kernels/opencl/buffer/my_custom_op.cc`.

The structure looks like the following code:
```c++
#include "mace/core/operator.h"
#include "mace/kernels/my_custom_op.h"
namespace mace {
namespace ops {
namespace kernels {
template <DeviceType D, class T>
class MyCustomOp;
template <DeviceType D, typename T>
class MyCustomOp : public Operator<D, T> {
public:
MyCustomOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws),
functor_() {}
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
functor_(input, output, future);
return true;
}
protected:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
private:
kernels::MyCustomOpFunctor<D, T> functor_;
template <>
class MyCustomOp<DeviceType::CPU, float> : public Operation {
...
}
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ActivationOp<DeviceType::GPU, T> : public Operation {
...
};
#endif // MACE_ENABLE_OPENCL
} // namespace ops
} // namespace mace
#endif // MACE_OPS_MY_CUSTOM_OP_H_
```
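To make the skeleton concrete, here is a minimal sketch (not from the official docs) of a CPU specialization for a hypothetical elementwise op. It follows the pattern of `ActivationOp` in `mace/kernels/activation.cc`; the identity loop is a placeholder for your own computation.
```c++
#include "mace/core/operator.h"

namespace mace {
namespace kernels {

template <DeviceType D, class T>
class MyCustomOp;

template <>
class MyCustomOp<DeviceType::CPU, float> : public Operation {
 public:
  explicit MyCustomOp(OpConstructContext *context)
      : Operation(context) {}

  MaceStatus Run(OpContext *context) override {
    MACE_UNUSED(context);
    const Tensor *input = this->Input(0);
    Tensor *output = this->Output(0);
    // An elementwise op produces output of the same shape as the input.
    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
    const float *input_ptr = input->data<float>();
    float *output_ptr = output->mutable_data<float>();
    // Placeholder: copy input to output; replace with the real math.
    for (index_t i = 0; i < output->size(); ++i) {
      output_ptr[i] = input_ptr[i];
    }
    return MaceStatus::MACE_SUCCESS;
  }
};

}  // namespace kernels
}  // namespace mace
```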
Register the Operation
-----------------------
1. Add a register function in `mace/kernels/my_custom_op.cc`:
```c++
#include "mace/ops/my_custom_op.h"
#include "mace/core/operator.h"
namespace mace {
namespace ops {
namespace kernels {
void Register_My_Custom_Op(OperatorRegistryBase *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("my_custom_op")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
Custom_Op<DeviceType::CPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("my_custom_op")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
.Build(),
Custom_Op<DeviceType::OPENCL, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("my_custom_op")
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
Custom_Op<DeviceType::OPENCL, half>);
}
void RegisterMyCustomOp(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MyCustomOp", ActivationOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "MyCustomOp", ActivationOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MyCustomOp", ActivationOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
} // namespace mace
```
2. Then register the new Op in `mace/kernels/ops_register.cc`:
```
#include "mace/ops/ops_register.h"
#include "mace/kernels/ops_register.h"
namespace mace {
namespace ops {
// Keep in lexicographical order
...
extern void RegisterMyCustomOp(OpRegistryBase *op_registry);
...
} // namespace ops
OpRegistry::OpRegistry() : OpRegistryBase() {
// Keep in lexicographical order
...
ops::RegisterMyCustomOp(this);
...
......@@ -113,16 +120,13 @@ OperatorRegistry::OperatorRegistry() : OperatorRegistryBase() {
} // namespace mace
```
Add UTs
----------------------
Add operation unit tests in `mace/ops/my_custom_op_test.cc`.
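A rough sketch of such a test follows; the helper names (`OpsTestBase`, `OpsTestNet`, `OpDefBuilder`, `ExpectTensorNear`) come from `mace/ops/ops_test_util.h` and are assumed here, so check the utilities in your checkout for the exact signatures.
```c++
#include "mace/ops/ops_test_util.h"

namespace mace {
namespace ops {
namespace test {

class MyCustomOpTest : public OpsTestBase {};

TEST_F(MyCustomOpTest, SimpleCPU) {
  OpsTestNet net;
  // Feed a small NCHW tensor.
  net.AddInputFromData<DeviceType::CPU, float>(
      "Input", {1, 1, 2, 2}, {1, 2, 3, 4});
  OpDefBuilder("MyCustomOp", "MyCustomOpTest")
      .Input("Input")
      .Output("Output")
      .Finalize(net.NewOperatorDef());
  net.RunOp();
  // For the identity placeholder, the output equals the input.
  net.AddInputFromData<DeviceType::CPU, float>(
      "Expected", {1, 1, 2, 2}, {1, 2, 3, 4});
  ExpectTensorNear<float>(*net.GetTensor("Expected"),
                          *net.GetOutput("Output"), 1e-5);
}

}  // namespace test
}  // namespace ops
}  // namespace mace
```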
Add benchmark
----------------------
Add an operation benchmark in `mace/ops/my_custom_op_benchmark.cc`.
It's strongly recommended to add unit tests and micro benchmarks for your
new Op. If you wish to contribute back, it's required.
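A micro benchmark usually builds the op once, warms it up, then times `iters` runs. The sketch below assumes the `mace::testing::StartTiming()`/`StopTiming()` helpers and the `MACE_BENCHMARK` registration macro from `mace/core/testing/test_benchmark.h`:
```c++
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"

namespace mace {
namespace ops {
namespace test {

static void MACE_BM_MY_CUSTOM_OP_CPU(int iters) {
  mace::testing::StopTiming();

  OpsTestNet net;
  net.AddRandomInput<DeviceType::CPU, float>("Input", {1, 3, 224, 224});
  OpDefBuilder("MyCustomOp", "MyCustomOpBM")
      .Input("Input")
      .Output("Output")
      .Finalize(net.NewOperatorDef());

  // Warm up once so one-time setup cost is excluded.
  net.RunOp(DeviceType::CPU);

  mace::testing::StartTiming();
  while (iters--) {
    net.RunOp(DeviceType::CPU);
  }
}
MACE_BENCHMARK(MACE_BM_MY_CUSTOM_OP_CPU);

}  // namespace test
}  // namespace ops
}  // namespace mace
```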
......
......@@ -263,7 +263,7 @@ int Main(int argc, char **argv) {
FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy),
true);
if (mace_status != MaceStatus::MACE_SUCCESS) {
LOG(INFO) << "Set openmp or cpu affinity failed.";
}
#ifdef MACE_ENABLE_OPENCL
......
......@@ -50,6 +50,7 @@ cc_library(
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
deps = [
"//mace/public",
"//mace/utils",
],
)
......
......@@ -27,7 +27,12 @@ struct CallStats;
// Wait the call to finish and get the stats if param is not nullptr
struct StatsFuture {
std::function<void(CallStats *)> wait_fn = [](CallStats *stats) {
if (stats != nullptr) {
stats->start_micros = NowMicros();
stats->end_micros = stats->start_micros;
}
};
};
inline void SetFutureDefaultWaitFn(StatsFuture *future) {
......
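The point of the new default: profiling code can now call `wait_fn` unconditionally. An op that never installs a real wait function (e.g. a CPU op) reports a zero-length interval instead of invoking an empty `std::function`, which would throw `std::bad_function_call`. A minimal sketch:
```c++
// Consuming a StatsFuture that no op has populated.
mace::StatsFuture future;
mace::CallStats stats;
future.wait_fn(&stats);  // default wait_fn: start_micros == end_micros
```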
......@@ -16,8 +16,10 @@
#include <algorithm>
#include <limits>
#include "mace/core/future.h"
#include "mace/core/macros.h"
#include "mace/core/net.h"
#include "mace/core/op_context.h"
#include "mace/public/mace.h"
#include "mace/utils/memory_logging.h"
#include "mace/utils/timer.h"
......@@ -25,39 +27,60 @@
namespace mace {
SerialNet::SerialNet(OpDefRegistryBase *op_def_registry,
const OpRegistryBase *op_registry,
const NetDef *net_def,
Workspace *ws,
Device *target_device,
const NetMode mode)
: NetBase(),
ws_(ws),
target_device_(target_device),
cpu_device_(
new CPUDevice(target_device->cpu_runtime()->num_threads(),
target_device->cpu_runtime()->policy(),
target_device->cpu_runtime()->use_gemmlowp())) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet");
// Register Operations
MaceStatus status;
for (int idx = 0; idx < net_def->op_types_size(); ++idx) {
status = op_def_registry->Register(net_def->op_types(idx));
MACE_CHECK(status == MaceStatus::MACE_SUCCESS, status.information());
}
// Create Operations
operators_.clear();
const OpRegistrationInfo *info;
DeviceType target_device_type = target_device_->device_type();
OpConstructContext construct_context(ws_);
for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx);
// Create the Operation
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
operator_def, "device", static_cast<int>(device_type));
if (op_device == device_type) {
VLOG(3) << "Creating operator " << operator_def.name() << "("
<< operator_def.type() << ")";
operator_def, "device", static_cast<int>(target_device_type));
if (op_device == target_device_type) {
// Find op registration information
status = op_def_registry->Find(operator_def.type(), &info);
MACE_CHECK(status == MaceStatus::MACE_SUCCESS, status.information());
// Get available devices (sorted based on priority)
auto available_devices = info->device_place_func_();
// Find the device type to run the op.
// If target_device_type is in the available devices, use it;
// otherwise, fall back to the first device (top priority).
DeviceType device_type = available_devices[0];
construct_context.set_device(cpu_device_);
for (auto device : available_devices) {
if (device == target_device_type) {
device_type = target_device_type;
construct_context.set_device(target_device_);
break;
}
}
OperatorDef temp_def(operator_def);
temp_def.set_device_type(device_type);
construct_context.set_operator_def(&temp_def);
std::unique_ptr<Operation> op(
op_registry->CreateOperation(&construct_context, device_type, mode));
if (op) {
operators_.emplace_back(std::move(op));
}
......@@ -65,38 +88,59 @@ SerialNet::SerialNet(
}
}
MaceStatus SerialNet::Init() {
// TODO(liuqi): where to do memory reuse.
MACE_LATENCY_LOGGER(1, "Initializing SerialNet");
OpInitContext init_context(ws_);
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
auto &op = *iter;
DeviceType device_type = op->device_type();
if (device_type == target_device_->device_type()) {
init_context.set_device(target_device_);
} else {
init_context.set_device(cpu_device_);
}
// Initialize the operation
MACE_RETURN_IF_ERROR(op->Init(&init_context));
}
return MaceStatus::MACE_SUCCESS;
}
MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
// TODO(liuqi): In/Out Buffer Transform
MACE_MEMORY_LOGGING_GUARD();
MACE_LATENCY_LOGGER(1, "Running net");
OpContext context(ws_, cpu_device_);
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
auto &op = *iter;
DeviceType device_type = op->device_type();
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(),
"<", device_type, ", ", op->debug_def().type(), ">",
". mem_id: ",
MakeListString(op->debug_def().mem_id().data(),
op->debug_def().mem_id().size()));
if (device_type == target_device_->device_type()) {
context.set_device(target_device_);
} else {
context.set_device(cpu_device_);
}
CallStats call_stats;
if (run_metadata == nullptr) {
MACE_RETURN_IF_ERROR(op->Run(&context));
} else {
if (device_type == DeviceType::CPU) {
call_stats.start_micros = NowMicros();
MACE_RETURN_IF_ERROR(op->Run(&context));
call_stats.end_micros = NowMicros();
} else if (device_type == DeviceType::GPU) {
StatsFuture future;
context.set_future(&future);
MACE_RETURN_IF_ERROR(op->Run(&context));
future.wait_fn(&call_stats);
}
// Record run metadata
std::vector<int> strides;
int padding_type = -1;
std::vector<int> paddings;
......@@ -150,19 +194,20 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
<< "@@" << min_v << "," << max_v;
}
} else {
const int bin_size = 2048;
for (int ind = 0; ind < op->debug_def().quantize_info_size(); ++ind) {
float min_v = op->debug_def().quantize_info(ind).minval();
float max_v = op->debug_def().quantize_info(ind).maxval();
std::vector<int> bin_distribution(bin_size, 0);
float bin_v = (max_v - min_v) / bin_size;
Tensor::MappingGuard guard(op->Output(i));
const float *output_data = op->Output(i)->data<float>();
for (index_t j = 0; j < op->Output(i)->size(); ++j) {
int ind = static_cast<int>((output_data[j] - min_v) / bin_v);
if (ind < 0)
ind = 0;
else if (ind > bin_size-1)
ind = bin_size-1;
bin_distribution[ind]++;
}
LOG(INFO) << "Tensor range @@" << op->debug_def().output(i)
......@@ -174,28 +219,6 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
}
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace mace
......@@ -21,64 +21,51 @@
#include <unordered_map>
#include <sstream>

#include "mace/core/op_def_registry.h"
#include "mace/core/operator.h"
#include "mace/utils/string_util.h"
namespace mace {
class RunMetadata;
class Workspace;
class NetBase {
public:
NetBase() noexcept = default;
virtual ~NetBase() = default;
virtual MaceStatus Init() = 0;
virtual MaceStatus Run(RunMetadata *run_metadata = nullptr) = 0;
protected:
MACE_DISABLE_COPY_AND_ASSIGN(NetBase);
};
class SerialNet : public NetBase {
public:
SerialNet(OpDefRegistryBase *op_def_registry,
const OpRegistryBase *op_registry,
const NetDef *net_def,
Workspace *ws,
Device *target_device,
const NetMode mode = NetMode::NORMAL);
MaceStatus Init() override;
MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
protected:
Workspace *ws_;
Device *target_device_;
// CPU is base device.
Device *cpu_device_;
std::vector<std::unique_ptr<Operation> > operators_;
MACE_DISABLE_COPY_AND_ASSIGN(SerialNet);
};
} // namespace mace
#endif // MACE_CORE_NET_H_
......@@ -12,21 +12,33 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_kernel_context.h"
#include "mace/core/op_context.h"
namespace mace {
OpContext::OpContext(Workspace *ws, Device *device)
: device_(device), ws_(ws), future_(nullptr) {}

OpContext::~OpContext() = default;

void OpContext::set_device(Device *device) {
device_ = device;
}

Device* OpContext::device() {
return device_;
}

Workspace* OpContext::workspace() {
return ws_;
}
void OpContext::set_future(StatsFuture *future) {
future_ = future;
}
StatsFuture *OpContext::future() {
return future_;
}
} // namespace mace
......@@ -12,23 +12,31 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_OP_CONTEXT_H_
#define MACE_CORE_OP_CONTEXT_H_
#include "mace/core/device.h"
#include "mace/core/workspace.h"
#include "mace/core/future.h"
namespace mace {
class OpContext {
public:
OpContext(Workspace *ws, Device *device);
~OpContext();
void set_device(Device *device);
Device *device();
Workspace *workspace();
void set_future(StatsFuture *future);
StatsFuture *future();
private:
Device *device_;
Workspace *ws_;
StatsFuture *future_;
// metadata
};
} // namespace mace
#endif  // MACE_CORE_OP_CONTEXT_H_
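Taken together with the `SerialNet` changes above, the intended usage is one long-lived context that the net retargets per op. A sketch distilled from `SerialNet::Run` (the `ws`, device, and `op` variables are assumed to be set up elsewhere):
```c++
OpContext context(ws, cpu_device);   // default to the CPU base device
context.set_device(target_device);   // retarget for an op placed elsewhere
StatsFuture future;
context.set_future(&future);         // only needed when collecting metadata
MACE_RETURN_IF_ERROR(op->Run(&context));
future.wait_fn(&call_stats);         // wait and harvest GPU timing
```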
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/utils/logging.h"
namespace mace {
void AddOpRegistrar(OpDefRegistryBase *registry,
const OpRegistrationBuilder &builder) {
registry->AddRegistrar(
builder.name(),
[builder](OpRegistrationInfo *info){
builder.Finalize(info);
});
}
OpRegistrationBuilder::OpRegistrationBuilder(const std::string name)
: name_(name) {}
const std::string OpRegistrationBuilder::name() const { return name_; }
OpRegistrationBuilder &OpRegistrationBuilder::SetDevicePlaceFunc(
std::vector<DeviceType> (*func)()) {
info_.device_place_func_ = func;
return *this;
}
void OpRegistrationBuilder::Finalize(OpRegistrationInfo *info) const {
*info = info_;
}
void OpDefRegistryBase::AddRegistrar(const std::string name,
const OpRegistrar &registrar) {
registrar_.emplace(name, registrar);
}
MaceStatus OpDefRegistryBase::Register(const std::string &name) {
VLOG(3) << "Registering operation definition: " << name;
if (registry_.find(name) != registry_.end()) {
return MaceStatus::MACE_SUCCESS;
}
auto iter = registrar_.find(name);
if (iter == registrar_.end()) {
return MaceStatus(MaceStatus::MACE_INVALID_ARGS,
"MACE do not support the operation: " + name);
}
registry_.emplace(
name, std::unique_ptr<OpRegistrationInfo>(new OpRegistrationInfo()));
iter->second(registry_[name].get());
return MaceStatus::MACE_SUCCESS;
}
MaceStatus OpDefRegistryBase::Find(const std::string &name,
const OpRegistrationInfo **info) {
auto iter = registry_.find(name);
if (iter == registry_.end()) {
*info = nullptr;
return MaceStatus(MaceStatus::MACE_INVALID_ARGS,
"Mace do not support the operation: " + name);
}
*info = iter->second.get();
return MaceStatus::MACE_SUCCESS;
}
} // namespace mace
......@@ -12,47 +12,70 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_OP_DEF_REGISTRY_H_
#define MACE_CORE_OP_DEF_REGISTRY_H_
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "mace/proto/mace.pb.h"
#include "mace/public/mace.h"
#include "mace/utils/utils.h"
namespace mace {
// Device placement function
typedef std::function<std::vector<DeviceType>()> DevicePlaceFunc;
struct OpRegistrationInfo {
OpRegistrationInfo() = default;
explicit OpRegistrationInfo(const DevicePlaceFunc &func)
: device_place_func_(func) {}
DevicePlaceFunc device_place_func_;
};
class OpRegistrationBuilder {
public:
explicit OpRegistrationBuilder(const std::string name);
const std::string name() const;
OpRegistrationBuilder &SetDevicePlaceFunc(
std::vector<DeviceType> (*func)());
void Finalize(OpRegistrationInfo *info) const;
private:
std::string name_;
OpRegistrationInfo info_;
};
class OpDefRegistryBase {
public:
typedef std::function<void(OpRegistrationInfo *)> OpRegistrar;
OpDefRegistryBase() = default;
virtual ~OpDefRegistryBase() = default;
void AddRegistrar(const std::string name, const OpRegistrar &registrar);
MaceStatus Register(const std::string &name);
MaceStatus Find(const std::string &name, const OpRegistrationInfo **info);
private:
std::unordered_map<std::string, OpRegistrar> registrar_;
std::unordered_map<
std::string,
std::unique_ptr<OpRegistrationInfo>> registry_;
MACE_DISABLE_COPY_AND_ASSIGN(OpDefRegistryBase);
};
void AddOpRegistrar(OpDefRegistryBase *registry,
const OpRegistrationBuilder &builder);
#define MACE_REGISTER_OP_DEF(op_def_registry, builder) \
AddOpRegistrar(op_def_registry, builder)
} // namespace mace
#endif // MACE_CORE_OP_DEF_REGISTRY_H_
......@@ -14,18 +14,69 @@
#include <sstream>
#include <memory>
#include <string>
#include <vector>
#include "mace/core/operator.h"
#include "mace/core/op_kernel_context.h"
namespace mace {
OpConstructContext::OpConstructContext(Workspace *ws)
: operator_def_(nullptr), ws_(ws), device_(nullptr) {}
OpConstructContext::OpConstructContext(OperatorDef *operator_def,
Workspace *ws,
Device *device)
: operator_def_(operator_def), ws_(ws), device_(device) {}
OpInitContext::OpInitContext(Workspace *ws, Device *device)
: ws_(ws), device_(device) {}
Operation::Operation(OpConstructContext *context)
: operator_def_(std::make_shared<OperatorDef>(*(context->operator_def())))
{}
MaceStatus Operation::Init(OpInitContext *context) {
Workspace *ws = context->workspace();
for (const std::string &input_str : operator_def_->input()) {
const Tensor *tensor = ws->GetTensor(input_str);
MACE_CHECK(tensor != nullptr, "op ", operator_def_->type(),
": Encountered a non-existing input tensor: ", input_str);
inputs_.push_back(tensor);
}
// TODO(liuqi): filter transform
for (int i = 0; i < operator_def_->output_size(); ++i) {
const std::string output_str = operator_def_->output(i);
if (ws->HasTensor(output_str)) {
// TODO(liuqi): Workspace should pre-allocate all of the output tensors
outputs_.push_back(ws->GetTensor(output_str));
} else {
MACE_CHECK(
operator_def_->output_type_size() == 0 ||
operator_def_->output_size() == operator_def_->output_type_size(),
"operator output size != operator output type size",
operator_def_->output_size(),
operator_def_->output_type_size());
DataType output_type;
if (i < operator_def_->output_type_size()) {
output_type = operator_def_->output_type(i);
} else {
output_type = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def_, "T", static_cast<int>(DT_FLOAT)));
}
outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
output_str, context->device()->allocator(), output_type)));
if (i < operator_def_->output_shape_size()) {
std::vector<index_t>
shape_configured(operator_def_->output_shape(i).dims_size());
for (size_t dim = 0; dim < shape_configured.size(); ++dim) {
shape_configured[dim] = operator_def_->output_shape(i).dims(dim);
}
ws->GetTensor(output_str)->SetShapeConfigured(shape_configured);
}
}
}
return MaceStatus::MACE_SUCCESS;
}
OpKeyBuilder::OpKeyBuilder(const char *op_name) : op_name_(op_name) {}
......@@ -36,7 +87,7 @@ OpKeyBuilder &OpKeyBuilder::Device(DeviceType device) {
}
OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name,
DataType allowed) {
type_constraint_[attr_name] = allowed;
return *this;
}
......@@ -53,27 +104,28 @@ const std::string OpKeyBuilder::Build() {
return ss.str();
}
OpRegistryBase::~OpRegistryBase() = default;
std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
OpConstructContext *context,
DeviceType device_type,
const NetMode mode) const {
OperatorDef *operator_def = context->operator_def();
const int dtype = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "T", static_cast<int>(DT_FLOAT));
const int op_mode_i = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "mode", static_cast<int>(NetMode::NORMAL));
const NetMode op_mode = static_cast<NetMode>(op_mode_i);
VLOG(3) << "Creating operator " << operator_def.name() << "("
<< operator_def.type() << "<" << dtype << ">" << ")";
VLOG(3) << "Creating operator " << operator_def->name() << "("
<< operator_def->type() << "<" << dtype << ">" << ") on "
<< device_type;
if (op_mode == mode) {
return registry_.Create(
OpKeyBuilder(operator_def->type().data())
.Device(device_type)
.TypeConstraint("T", static_cast<DataType>(dtype))
.Build(),
context);
} else {
return nullptr;
}
......
......@@ -21,8 +21,7 @@
#include <map>
#include "mace/core/arg_helper.h"
#include "mace/core/future.h"
#include "mace/core/op_kernel_context.h"
#include "mace/core/op_context.h"
#include "mace/core/registry.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
......@@ -30,10 +29,66 @@
namespace mace {
// memory_optimizer, device
class OpConstructContext {
public:
explicit OpConstructContext(Workspace *ws);
OpConstructContext(OperatorDef *operator_def, Workspace *ws, Device *device);
~OpConstructContext() = default;
inline void set_operator_def(OperatorDef *operator_def) {
operator_def_ = operator_def;
}
inline OperatorDef *operator_def() const {
return operator_def_;
}
inline Workspace *workspace() const {
return ws_;
}
inline void set_device(Device* device) {
device_ = device;
}
inline Device *device() const {
return device_;
}
private:
OperatorDef *operator_def_;
Workspace *ws_;
Device *device_;
};
// memory_optimizer, device
class OpInitContext {
public:
explicit OpInitContext(Workspace *ws, Device *device = nullptr);
~OpInitContext() = default;
inline Workspace *workspace() const {
return ws_;
}
inline void set_device(Device *device) {
device_ = device;
}
inline Device *device() const {
return device_;
}
private:
Workspace *ws_;
Device *device_;
};
class Operation {
public:
explicit Operation(OpConstructContext *context);
virtual ~Operation() = default;
template <typename T>
inline T GetOptionalArg(const std::string &name,
......@@ -50,6 +105,10 @@ class OperatorBase {
*operator_def_, name, default_value);
}
inline DeviceType device_type() const {
return static_cast<DeviceType>(operator_def_->device_type());
}
inline const Tensor *Input(unsigned int idx) {
MACE_CHECK(idx < inputs_.size());
return inputs_[idx];
......@@ -63,7 +122,8 @@ class OperatorBase {
inline const std::vector<Tensor *> &Outputs() { return outputs_; }
virtual MaceStatus Init(OpInitContext *);
virtual MaceStatus Run(OpContext *) = 0;
inline const OperatorDef &debug_def() const {
MACE_CHECK(has_debug_def(), "operator_def was null!");
......@@ -82,55 +142,7 @@ class OperatorBase {
std::vector<const Tensor *> inputs_;
std::vector<Tensor *> outputs_;
MACE_DISABLE_COPY_AND_ASSIGN(Operation);
};
// MACE_OP_INPUT_TAGS and MACE_OP_OUTPUT_TAGS are optional features to name the
......@@ -154,7 +166,8 @@ class OpKeyBuilder {
OpKeyBuilder &Device(DeviceType device);
OpKeyBuilder &TypeConstraint(const char *attr_name,
DataType allowed);
template <typename T>
OpKeyBuilder &TypeConstraint(const char *attr_name);
......@@ -172,33 +185,37 @@ OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {
return this->TypeConstraint(attr_name, DataTypeToEnum<T>::value);
}
class OpRegistryBase {
public:
typedef Registry<std::string,
Operation,
OpConstructContext *>
RegistryType;
OpRegistryBase() = default;
virtual ~OpRegistryBase();
RegistryType *registry() { return &registry_; }
std::unique_ptr<Operation> CreateOperation(
OpConstructContext *context,
DeviceType device_type,
const NetMode mode) const;
private:
RegistryType registry_;
MACE_DISABLE_COPY_AND_ASSIGN(OpRegistryBase);
};
MACE_DECLARE_REGISTRY(OpRegistry,
Operation,
OpConstructContext *);
#define MACE_REGISTER_OP(op_registry, op_type, class_name, device, dt) \
MACE_REGISTER_CLASS(OpRegistry, \
op_registry->registry(), \
OpKeyBuilder(op_type) \
.Device(device) \
.TypeConstraint<dt>("T") \
.Build(), \
class_name<device, dt>)
} // namespace mace
......
......@@ -18,12 +18,13 @@
#include <omp.h>
#endif
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <algorithm>
#include <cerrno>
#include <cstring>
#include <string>
#include <utility>
#include <vector>
......@@ -85,9 +86,10 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) {
int err = sched_setaffinity(pid, sizeof(mask), &mask);
if (err) {
LOG(WARNING) << "set affinity error: " << strerror(errno);
return MaceStatus(MaceStatus::MACE_INVALID_ARGS,
"set affinity error: " + std::string(strerror(errno)));
} else {
return MACE_SUCCESS;
return MaceStatus::MACE_SUCCESS;
}
}
......@@ -104,7 +106,9 @@ MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
if (cpu_max_freq[i] == 0) {
LOG(WARNING) << "Cannot get CPU" << i
<< "'s max frequency info, maybe it is offline.";
return MaceStatus(MaceStatus::MACE_INVALID_ARGS,
"Cannot get CPU's max frequency info,"
" maybe it is offline.");
}
}
......@@ -124,7 +128,7 @@ MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
}
}
return MaceStatus::MACE_SUCCESS;
}
MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
......@@ -147,7 +151,8 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
CPU_SET(cpu_id, &mask);
}
#ifdef MACE_ENABLE_OPENMP
std::vector<MaceStatus> status(omp_num_threads,
MaceStatus::MACE_INVALID_ARGS);
#pragma omp parallel for
for (int i = 0; i < omp_num_threads; ++i) {
VLOG(1) << "Set affinity for OpenMP thread " << omp_get_thread_num()
......@@ -155,10 +160,10 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
status[i] = SetThreadAffinity(mask);
}
for (int i = 0; i < omp_num_threads; ++i) {
if (status[i] != MaceStatus::MACE_SUCCESS)
return MaceStatus::MACE_INVALID_ARGS;
}
return MaceStatus::MACE_SUCCESS;
#else
MaceStatus status = SetThreadAffinity(mask);
VLOG(1) << "Set affinity without OpenMP: " << mask.__bits[0];
......@@ -183,13 +188,13 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
#else
LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled.";
#endif
return MaceStatus::MACE_SUCCESS;
}
std::vector<int> big_core_ids;
std::vector<int> little_core_ids;
MaceStatus res = GetCPUBigLittleCoreIDs(&big_core_ids, &little_core_ids);
if (res != MaceStatus::MACE_SUCCESS) {
return res;
}
......
......@@ -55,6 +55,14 @@ class CPURuntime {
return num_threads_;
}
CPUAffinityPolicy policy() const {
return policy_;
}
bool use_gemmlowp() const {
return gemm_context_ != nullptr;
}
private:
MaceStatus SetOpenMPThreadsAndAffinityPolicy(
int omp_num_threads_hint,
......
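The new `policy()` and `use_gemmlowp()` accessors exist so `SerialNet` can mirror the target device's CPU settings when it builds its CPU fallback device, as its constructor above does:
```c++
// Sketch: cloning the target device's CPU runtime configuration.
Device *cpu_device =
    new CPUDevice(target_device->cpu_runtime()->num_threads(),
                  target_device->cpu_runtime()->policy(),
                  target_device->cpu_runtime()->use_gemmlowp());
```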
......@@ -38,7 +38,7 @@ OpenCLRuntime* GPUDevice::opencl_runtime() {
return runtime_.get();
}
Allocator *GPUDevice::allocator() {
return allocator_.get();
}
......
......@@ -27,6 +27,7 @@
#include "src/main/cpp/include/mace/public/mace.h"
#include "src/main/cpp/include/mace/public/mace_engine_factory.h"
#include "mace/public/mace.h"
namespace {
......@@ -112,11 +113,12 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine(
omp_num_threads,
static_cast<mace::CPUAffinityPolicy>(cpu_affinity_policy),
true);
if (status != mace::MaceStatus::MACE_SUCCESS) {
__android_log_print(ANDROID_LOG_ERROR,
"image_classify attrs",
"openmp result: %d, threads: %d, cpu: %d",
status, omp_num_threads, cpu_affinity_policy);
"openmp result: %s, threads: %d, cpu: %d",
status.information().c_str(), omp_num_threads,
cpu_affinity_policy);
}
if (mace_context.device_type == mace::DeviceType::GPU) {
config.SetGPUContext(mace_context.gpu_context);
......@@ -163,8 +165,8 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine(
__android_log_print(ANDROID_LOG_INFO,
"image_classify attrs",
"create result: %d",
create_engine_status);
"create result: %s",
create_engine_status.information().c_str());
return create_engine_status == mace::MaceStatus::MACE_SUCCESS ?
JNI_OK : JNI_ERR;
......
......@@ -170,7 +170,7 @@ bool RunModel(const std::vector<std::string> &input_names,
status = config.SetCPUThreadPolicy(
FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
if (status != MaceStatus::MACE_SUCCESS) {
std::cerr << "Set openmp or cpu affinity failed." << std::endl;
}
#ifdef MACE_ENABLE_OPENCL
......
......@@ -28,12 +28,20 @@ cc_library(
"*_test.cc",
"*_benchmark.cc",
"arm/*_test.cc",
"buffer_inverse_transform.cc",
"buffer_transform.cc",
"lstm_cell.cc",
"winograd_transform.cc",
],
) + if_opencl_enabled(glob(
[
"opencl/*.cc",
"opencl/image/*.cc",
"opencl/buffer/*.cc",
"buffer_inverse_transform.cc",
"buffer_transform.cc",
"lstm_cell.cc",
"winograd_transform.cc",
],
exclude = [
"opencl/*_test.cc",
......@@ -44,18 +52,10 @@ cc_library(
"*.h",
"arm/*.h",
],
) + if_opencl_enabled(glob([
"opencl/*.h",
"opencl/image/*.h",
"opencl/buffer/*.h",
"buffer_transform.h",
"buffer_inverse_transform.h",
"lstmcell.h",
])),
copts = [
"-Werror",
......@@ -77,7 +77,6 @@ cc_library(
linkopts = if_android(["-lm"]),
deps = [
"//mace/core",
"//mace/utils",
"@gemmlowp",
"@tflite",
],
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/activation.h"
#include <memory>
#include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/activation.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
template <DeviceType D, class T>
class ActivationOp;
template <>
class ActivationOp<DeviceType::CPU, float> : public Operation {
public:
explicit ActivationOp(OpConstructContext *context)
: Operation(context),
activation_(kernels::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit",
0.0f)) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
const float *input_ptr = input->data<float>();
float *output_ptr = output->mutable_data<float>();
if (activation_ == PRELU) {
MACE_CHECK(this->InputSize() > 1);
const Tensor *alpha = this->Input(1);
const float *alpha_ptr = alpha->data<float>();
const index_t outer_size = output->dim(0);
const index_t inner_size = output->dim(2) * output->dim(3);
PReLUActivation(input_ptr, outer_size, input->dim(1), inner_size,
alpha_ptr, output_ptr);
} else {
DoActivation(input_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
}
return MaceStatus::MACE_SUCCESS;
}
private:
ActivationType activation_;
float relux_max_limit_;
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ActivationOp<DeviceType::GPU, T> : public Operation {
public:
explicit ActivationOp(OpConstructContext *context)
: Operation(context) {
ActivationType type = kernels::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"));
auto relux_max_limit = static_cast<T>(
Operation::GetOptionalArg<float>("max_limit", 0.0f));
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(
new opencl::image::ActivationKernel<T>(type, relux_max_limit));
} else {
MACE_NOT_IMPLEMENTED;
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
const Tensor *alpha = this->InputSize() > 1 ? this->Input(1) : nullptr;
Tensor *output = this->Output(0);
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return kernel_->Compute(context, input, alpha, output);
}
private:
std::unique_ptr<OpenCLActivationKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
void RegisterActivation(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace mace
......@@ -17,15 +17,11 @@
#include <algorithm>
#include <cmath>
#include <memory>
#include <string>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/kernels/kernel.h"
#include "mace/kernels/arm/activation_neon.h"
#include "mace/utils/logging.h"
namespace mace {
namespace kernels {
......@@ -153,73 +149,6 @@ void PReLUActivation(const T *input_ptr,
}
}
} // namespace kernels
} // namespace mace
......
......@@ -12,39 +12,43 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include <algorithm>
#include <memory>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/addn.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
static constexpr int kCostPerGroup = 1024;
template <DeviceType D, class T>
class AddNOp;
template <>
class AddNOp<DeviceType::CPU, float> : public Operation {
public:
explicit AddNOp(OpConstructContext *context)
: Operation(context) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
Tensor *output_tensor = this->Output(0);
size_t input_size = this->inputs_.size();
MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(inputs_[0]));
index_t size = output_tensor->size();
Tensor::MappingGuard output_map(output_tensor);
float *output_data = output_tensor->mutable_data<float>();
memset(output_data, 0, size * sizeof(float));
int64_t cost = size * input_size;
int64_t groups = 1;
if (cost > kCostPerGroup) {
groups = cost / kCostPerGroup;
......@@ -52,8 +56,13 @@ struct AddNFunctor : OpKernel {
int64_t element_per_group = size / groups;
std::vector<Tensor::MappingGuard> mappers;
for (size_t i = 0; i < input_size; ++i) {
MACE_CHECK(inputs_[0]->dim_size() == inputs_[i]->dim_size());
MACE_CHECK(inputs_[0]->size() == inputs_[i]->size())
<< "Input 0: " << MakeString(inputs_[0]->shape())
<< ", size: " << inputs_[0]->size() << ". Input " << i << ": "
<< MakeString(inputs_[i]->shape()) << ", size: " << inputs_[i]->size();
mappers.emplace_back(Tensor::MappingGuard(inputs_[i]));
}
#pragma omp parallel for
......@@ -61,8 +70,8 @@ struct AddNFunctor : OpKernel {
int64_t count = std::min(element_per_group, size - i);
int nn = count >> 2;
int remain = count - (nn << 2);
for (size_t j = 0; j < input_size; ++j) {
const float *input_data = inputs_[j]->data<float>();
const float *input_ptr = input_data + i;
float *output_ptr = output_data + i;
for (int k = 0; k < nn; ++k) {
......@@ -87,32 +96,51 @@ struct AddNFunctor : OpKernel {
}
}
}
return MaceStatus::MACE_SUCCESS;
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class AddNOp<DeviceType::GPU, T> : public Operation {
public:
explicit AddNOp(OpConstructContext *context)
: Operation(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::AddNKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
}
MaceStatus Run(OpContext *context) override {
Tensor *output_tensor = this->Output(0);
size_t n = this->inputs_.size();
for (size_t i = 1; i < n; ++i) {
MACE_CHECK(inputs_[0]->dim_size() == inputs_[i]->dim_size());
MACE_CHECK(inputs_[0]->size() == inputs_[i]->size())
<< "Input 0: " << MakeString(inputs_[0]->shape())
<< ", size: " << inputs_[0]->size() << ". Input " << i << ": "
<< MakeString(inputs_[i]->shape()) << ", size: " << inputs_[i]->size();
}
return kernel_->Compute(context, inputs_, output_tensor);
}
private:
std::unique_ptr<OpenCLAddNKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
void RegisterAddN(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace mace
......@@ -12,32 +12,28 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <functional>
#include <limits>
#include <memory>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#include "mace/utils/utils.h"
#include "mace/core/operator.h"
namespace mace {
namespace kernels {
template <DeviceType D, class T>
class ArgMaxOp : public Operation {
public:
explicit ArgMaxOp(OpConstructContext *context)
: Operation(context) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
const Tensor *input = this->Input(0);
const Tensor *axis = this->Input(1);
Tensor *output = this->Output(0);
MACE_CHECK(input->dim_size() > 0, "ArgMax input should not be a scalar");
MACE_CHECK(axis->dim_size() == 0, "Mace argmax only supports scalar axis");
......@@ -77,11 +73,16 @@ struct ArgMaxFunctor : OpKernel {
output_data[i] = idx;
}
return MaceStatus::MACE_SUCCESS;
}
};
void RegisterArgMax(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ArgMax", ArgMaxOp,
DeviceType::CPU, float);
}
} // namespace kernels
} // namespace mace
......@@ -12,46 +12,50 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include <memory>
#include <string>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/operator.h"
#include "mace/kernels/activation.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/batch_norm.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
template <DeviceType D, class T>
class BatchNormOp;
template <>
class BatchNormOp<DeviceType::CPU, float> : public Operation {
public:
explicit BatchNormOp(OpConstructContext *context)
: Operation(context),
epsilon_(Operation::GetOptionalArg<float>("epsilon",
static_cast<float>(1e-4))),
activation_(kernels::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", "NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
bool not_folded = this->InputSize() == 5;
const Tensor *input = this->Input(INPUT);
const Tensor *scale = this->Input(SCALE);
const Tensor *offset = this->Input(OFFSET);
MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ",
input->dim_size());
MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ",
scale->dim_size());
MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. ",
offset->dim_size());
Tensor *output = this->Output(OUTPUT);
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
// Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
// The calculation formula for inference is
// Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X +
......@@ -77,7 +81,13 @@ struct BatchNormFunctor<DeviceType::CPU, float> : OpKernel {
std::vector<float> new_scale;
std::vector<float> new_offset;
if (not_folded) {
const Tensor *mean = this->Input(MEAN);
const Tensor *var = this->Input(VAR);
MACE_CHECK(mean->dim_size() == 1, "mean must be 1-dimensional. ",
mean->dim_size());
MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. ",
var->dim_size());
new_scale.resize(channels);
new_offset.resize(channels);
Tensor::MappingGuard mean_mapper(mean);
......@@ -86,14 +96,14 @@ struct BatchNormFunctor<DeviceType::CPU, float> : OpKernel {
const float *var_ptr = var->data<float>();
#pragma omp parallel for
for (index_t c = 0; c < channels; ++c) {
new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon_);
new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c];
}
}
const float *scale_data = not_folded ? new_scale.data() : scale_ptr;
const float
*offset_data = not_folded ? new_offset.data() : offset_ptr;
index_t channel_size = height * width;
index_t batch_size = channels * channel_size;
......@@ -105,55 +115,95 @@ struct BatchNormFunctor<DeviceType::CPU, float> : OpKernel {
index_t offset = b * batch_size + c * channel_size;
for (index_t hw = 0; hw < height * width; ++hw) {
output_ptr[offset + hw] =
scale_data[c] * input_ptr[offset + hw] + offset_data[c];
}
}
}
DoActivation(output_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
return MaceStatus::MACE_SUCCESS;
}
private:
float epsilon_;
const ActivationType activation_;
const float relux_max_limit_;
protected:
MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class BatchNormOp<DeviceType::GPU, T> : public Operation {
public:
explicit BatchNormOp(OpConstructContext *context)
: Operation(context) {
float epsilon = Operation::GetOptionalArg<float>(
"epsilon", static_cast<float>(1e-4));
ActivationType activation = kernels::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", "NOOP"));
float relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BatchNormKernel<T>(
epsilon, activation, relux_max_limit));
} else {
MACE_NOT_IMPLEMENTED;
}
}
MaceStatus Run(OpContext *context) override {
bool not_folded = this->InputSize() == 5;
const Tensor *input = this->Input(INPUT);
const Tensor *scale = this->Input(SCALE);
const Tensor *offset = this->Input(OFFSET);
const Tensor *mean = not_folded ? this->Input(MEAN) : nullptr;
const Tensor *var = not_folded ? this->Input(VAR) : nullptr;
MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ",
input->dim_size());
MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ",
scale->dim_size());
MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. ",
offset->dim_size());
if (not_folded) {
MACE_CHECK(mean->dim_size() == 1, "mean must be 1-dimensional. ",
mean->dim_size());
MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. ",
var->dim_size());
}
Tensor *output = this->Output(OUTPUT);
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return kernel_->Compute(context, input, scale, offset, mean,
var, output);
}
private:
std::unique_ptr<OpenCLBatchNormKernel> kernel_;
protected:
MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
#endif // MACE_ENABLE_OPENCL
void RegisterBatchNorm(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_BATCH_NORM_H_
......@@ -12,34 +12,30 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_BATCH_TO_SPACE_H_
#define MACE_KERNELS_BATCH_TO_SPACE_H_
#include <algorithm>
#include <memory>
#include <vector>
#include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/batch_to_space.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
class BatchToSpaceOpBase : public Operation {
public:
explicit BatchToSpaceOpBase(OpConstructContext *context)
: Operation(context),
paddings_(Operation::GetRepeatedArgs<int>("crops", {0, 0, 0, 0})),
block_shape_(Operation::GetRepeatedArgs<int>("block_shape", {1, 1})) {
MACE_CHECK(
block_shape_.size() == 2 && block_shape_[0] > 1 && block_shape_[1] > 1,
"Block's shape should be 1D, and greater than 1");
MACE_CHECK(paddings_.size() == 4, "Paddings' shape should be 2D");
}
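// The crops and block shape are now read from the OperatorDef arguments
// ("crops", "block_shape") instead of being passed as constructor parameters.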
protected:
std::vector<int> paddings_;
std::vector<int> block_shape_;
......@@ -83,21 +79,19 @@ struct BatchToSpaceFunctorBase : OpKernel {
}
};
template <DeviceType D, class T>
class BatchToSpaceNDOp;
template <>
class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
: BatchToSpaceOpBase(context) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
const Tensor *batch_tensor = this->Input(0);
Tensor *space_tensor = this->Output(0);
std::vector<index_t> output_shape(4, 0);
CalculateBatchToSpaceOutputShape(batch_tensor,
DataFormat::NCHW,
......@@ -177,24 +171,21 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
} // block_h
} // c
return MaceStatus::MACE_SUCCESS;
}
};
template <>
class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
: BatchToSpaceOpBase(context) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
const Tensor *batch_tensor = this->Input(0);
Tensor *space_tensor = this->Output(0);
std::vector<index_t> output_shape(4, 0);
CalculateBatchToSpaceOutputShape(batch_tensor,
DataFormat::NHWC,
output_shape.data());
......@@ -264,38 +255,53 @@ struct BatchToSpaceFunctor<CPU, uint8_t> : BatchToSpaceFunctorBase {
} // h
} // b
return MaceStatus::MACE_SUCCESS;
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
: BatchToSpaceOpBase(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BatchToSpaceKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *batch_tensor = this->Input(0);
Tensor *space_tensor = this->Output(0);
std::vector<index_t> output_shape(4, 0);
CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC,
output_shape.data());
return kernel_->Compute(context, batch_tensor, paddings_, block_shape_,
output_shape, space_tensor);
}
private:
std::unique_ptr<OpenCLBatchToSpaceKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, uint8_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_BATCH_TO_SPACE_H_
......@@ -12,43 +12,40 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_BIAS_ADD_H_
#define MACE_KERNELS_BIAS_ADD_H_
#include <functional>
#include <memory>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#include "mace/core/operator.h"
#include "mace/kernels/activation.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/bias_add.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
template <DeviceType D, class T>
class BiasAddOp;
template <>
class BiasAddOp<DeviceType::CPU, float> : public Operation {
public:
explicit BiasAddOp(OpConstructContext *context)
: Operation(context),
data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", NHWC))) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
const Tensor *input = this->Input(0);
const Tensor *bias = this->Input(1);
MACE_CHECK(bias->dim_size() == 1, "bias must be 1-dimensional. ",
bias->dim_size());
Tensor *output = this->Output(0);
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard bias_mapper(bias);
......@@ -87,35 +84,60 @@ struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase {
}
}
return MaceStatus::MACE_SUCCESS;
}
private:
DataFormat data_format_;
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class BiasAddOp<DeviceType::GPU, T> : public Operation {
public:
explicit BiasAddOp(OpConstructContext *context)
: Operation(context),
data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", NHWC))) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BiasAddKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
const Tensor *bias = this->Input(1);
MACE_CHECK(bias->dim_size() == 1, "bias must be 1-dimensional. ",
bias->dim_size());
Tensor *output = this->Output(0);
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
MACE_CHECK(input->dim_size() == 4 && data_format_ == NHWC,
"gpu only support biasadd for 4-dimensional NHWC format tensor");
return kernel_->Compute(context, input, bias, output);
}
private:
DataFormat data_format_;
std::unique_ptr<OpenCLBiasAddKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
void RegisterBiasAdd(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_BIAS_ADD_H_
......@@ -12,38 +12,56 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/buffer_inverse_transform.h"
#include "mace/kernels/opencl/image/image_to_buffer.h"
#include <memory>
#include "mace/core/operator.h"
#include "mace/kernels/opencl/buffer/buffer_inverse_transform.h"
#include "mace/kernels/opencl/image/image_to_buffer.h"
namespace mace {
namespace kernels {
template <DeviceType D, class T>
class BufferInverseTransformOp;
template <typename T>
class BufferInverseTransformOp<DeviceType::GPU, T> : public Operation {
public:
explicit BufferInverseTransformOp(OpConstructContext *context)
: Operation(context),
wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 2)) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::ImageToBuffer<T>);
} else {
kernel_.reset(new opencl::buffer::BufferInverseTransform<T>);
}
}
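// Depending on the runtime's memory type, this op either converts an OpenCL
// image back to a buffer or reverses the buffer-side transform.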
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
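// "buffer_type" selects which packing layout to undo; it defaults to
// CONV2D_FILTER when the model does not specify one.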
kernels::BufferType type =
static_cast<kernels::BufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
return kernel_->Compute(context, input, type,
wino_blk_size_, output);
}
private:
const int wino_blk_size_;
std::unique_ptr<OpenCLBufferInverseTransformKernel> kernel_;
};
void RegisterBufferInverseTransform(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BufferInverseTransform",
BufferInverseTransformOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BufferInverseTransform",
BufferInverseTransformOp, DeviceType::GPU, half);
}
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_
#define MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/kernels/opencl/common.h"
namespace mace {
class OpContext;
namespace kernels {
class OpenCLBufferInverseTransformKernel {
 public:
  virtual MaceStatus Compute(OpContext *context,
                             const Tensor *input,
                             const BufferType type,
                             const int wino_blk_size,
                             Tensor *output) = 0;
  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBufferInverseTransformKernel)
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_
......@@ -12,37 +12,56 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/buffer_transform.h"
#include "mace/kernels/opencl/image/buffer_to_image.h"
#include <memory>
#include "mace/core/operator.h"
#include "mace/kernels/opencl/buffer/buffer_transform.h"
#include "mace/kernels/opencl/image/buffer_to_image.h"
namespace mace {
namespace kernels {
template <DeviceType D, class T>
class BufferTransformOp;
template <typename T>
class BufferTransformOp<DeviceType::GPU, T> : public Operation {
public:
explicit BufferTransformOp(OpConstructContext *context)
: Operation(context),
wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 2)) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BufferToImage<T>);
} else {
kernel_.reset(new opencl::buffer::BufferTransform<T>);
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
kernels::BufferType type =
static_cast<kernels::BufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
return kernel_->Compute(context, input, type,
wino_blk_size_, output);
}
private:
const int wino_blk_size_;
std::unique_ptr<OpenCLBufferTransformKernel> kernel_;
};
void RegisterBufferTransform(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BufferTransform",
BufferTransformOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BufferTransform",
BufferTransformOp, DeviceType::GPU, half);
}
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_BUFFER_TRANSFORM_H_
#define MACE_KERNELS_BUFFER_TRANSFORM_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/kernels/opencl/common.h"
namespace mace {
class OpContext;
namespace kernels {
class OpenCLBufferTransformKernel {
 public:
  virtual MaceStatus Compute(OpContext *context,
                             const Tensor *input,
                             const BufferType type,
                             const int wino_blk_size,
                             Tensor *output) = 0;
  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBufferTransformKernel)
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_BUFFER_TRANSFORM_H_
......@@ -12,24 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_CAST_H_
#define MACE_OPS_CAST_H_
#include <vector>
#include "mace/core/operator.h"
namespace mace {
namespace ops {
namespace kernels {
template <DeviceType D, typename SrcType>
class CastOp : public Operation {
public:
explicit CastOp(OpConstructContext *context)
: Operation(context) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
MACE_RETURN_IF_ERROR(output->ResizeLike(input))
......@@ -47,7 +42,7 @@ class CastOp : public Operator<D, SrcType> {
MACE_RUN_WITH_TYPE_ENUM(dst_dtype, MACE_CAST_COPY);
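// MACE_RUN_WITH_TYPE_ENUM dispatches MACE_CAST_COPY over the runtime
// destination dtype, so a single CastOp instantiation can cast to any
// registered target type.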
return MaceStatus::MACE_SUCCESS;
}
private:
......@@ -55,7 +50,12 @@ class CastOp : public Operator<D, SrcType> {
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
void RegisterCast(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Cast", CastOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Cast", CastOp,
DeviceType::CPU, int32_t);
}
} // namespace kernels
} // namespace mace
#endif // MACE_OPS_CAST_H_
......@@ -12,28 +12,33 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_
#define MACE_KERNELS_CHANNEL_SHUFFLE_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/channel_shuffle.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
template <DeviceType D, class T>
class ChannelShuffleOp;
template <typename T>
class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
public:
explicit ChannelShuffleOp(OpConstructContext *context)
: Operation(context),
groups_(Operation::GetOptionalArg<int>("group", 1)) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
MACE_CHECK(input->dim(1) % groups_ == 0,
"input channels must be an integral multiple of group. ",
input->dim(1));
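// CPU tensors are NCHW here, so dim(1) is the channel count, which must be
// divisible by the shuffle group size.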
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
Tensor::MappingGuard logits_guard(input);
......@@ -64,35 +69,51 @@ struct ChannelShuffleFunctor : OpKernel {
}
}
return MaceStatus::MACE_SUCCESS;
}
private:
const int groups_;
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
public:
explicit ChannelShuffleOp(OpConstructContext *context)
: Operation(context) {
const int groups = Operation::GetOptionalArg<int>("group", 1);
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::ChannelShuffleKernel<T>(groups));
} else {
MACE_NOT_IMPLEMENTED;
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
return kernel_->Compute(context, input, output);
}
private:
std::unique_ptr<OpenCLChannelShuffleKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
void RegisterChannelShuffle(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_CHANNEL_SHUFFLE_H_
......@@ -12,33 +12,54 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_CONCAT_H_
#define MACE_KERNELS_CONCAT_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#include "mace/core/operator.h"
#include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/concat.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
class ConcatOpBase : public Operation {
public:
explicit ConcatOpBase(OpConstructContext *context)
: Operation(context),
axis_(Operation::GetOptionalArg<int>("axis", 3)) {}
protected:
void Validate() {
const int32_t input_dims = this->Input(0)->dim_size();
axis_ =
axis_ < 0 ? axis_ + input_dims : axis_;
MACE_CHECK((0 <= axis_ && axis_ < input_dims),
"Expected concatenating axis in the range [", -input_dims, ", ",
input_dims, "], but got ", axis_);
}
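// Validate() also normalizes a negative axis (e.g. -1 for the last
// dimension) to its positive index before range-checking it.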
protected:
int axis_;
};
template <DeviceType D, class T>
class ConcatOp;
template <typename T>
class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
public:
explicit ConcatOp(OpConstructContext *context)
: ConcatOpBase(context) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
Validate();
const std::vector<const Tensor *> &inputs = this->Inputs();
Tensor *output = this->Output(0);
const Tensor *input0 = inputs.front();
const size_t inputs_count = inputs.size();
std::vector<index_t> output_shape(input0->shape());
index_t inner_size = 1;
......@@ -48,7 +69,7 @@ struct ConcatFunctor : OpKernel {
std::vector<index_t> outer_sizes(inputs_count, 0);
outer_sizes[0] = input0->size() / inner_size;
for (size_t i = 1; i < inputs_count; ++i) {
const Tensor *input = inputs[i];
MACE_CHECK(input->dim_size() == input0->dim_size(),
"Ranks of all input tensors must be same.");
for (int j = 0; j < input->dim_size(); ++j) {
......@@ -65,9 +86,9 @@ struct ConcatFunctor : OpKernel {
T *output_ptr = output->mutable_data<T>();
std::vector<const T *> input_ptrs(inputs.size(), nullptr);
for (size_t i = 0; i < inputs_count; ++i) {
input_ptrs[i] = inputs[i]->data<T>();
}
for (int inner_idx = 0; inner_idx < inner_size; ++inner_idx) {
for (size_t i = 0; i < inputs_count; ++i) {
......@@ -83,24 +104,24 @@ struct ConcatFunctor : OpKernel {
}
}
return MaceStatus::MACE_SUCCESS;
}
};
template <>
class ConcatOp<DeviceType::CPU, uint8_t> : public ConcatOpBase {
public:
explicit ConcatOp(OpConstructContext *context)
: ConcatOpBase(context) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
Validate();
const std::vector<const Tensor *> &inputs = this->Inputs();
Tensor *output = this->Output(0);
MACE_CHECK(output->scale() != 0);
const Tensor *input0 = inputs.front();
const size_t inputs_count = inputs.size();
std::vector<index_t> output_shape(input0->shape());
index_t inner_size = 1;
......@@ -110,7 +131,7 @@ struct ConcatFunctor<DeviceType::CPU, uint8_t> : OpKernel {
std::vector<index_t> outer_sizes(inputs_count, 0);
outer_sizes[0] = input0->size() / inner_size;
for (size_t i = 1; i < inputs_count; ++i) {
const Tensor *input = inputs[i];
MACE_CHECK(input->dim_size() == input0->dim_size(),
"Ranks of all input tensors must be same.");
for (int j = 0; j < input->dim_size(); ++j) {
......@@ -127,22 +148,22 @@ struct ConcatFunctor<DeviceType::CPU, uint8_t> : OpKernel {
auto output_ptr = output->mutable_data<uint8_t>();
std::vector<const uint8_t *> input_ptrs(inputs.size(), nullptr);
for (size_t i = 0; i < inputs_count; ++i) {
input_ptrs[i] = inputs[i]->data<uint8_t>();
}
for (int inner_idx = 0; inner_idx < inner_size; ++inner_idx) {
for (size_t i = 0; i < inputs_count; ++i) {
if (inputs[i]->zero_point() == output->zero_point()
    && inputs[i]->scale() == output->scale()) {
memcpy(output_ptr, input_ptrs[i], outer_sizes[i] * sizeof(uint8_t));
output_ptr += outer_sizes[i];
input_ptrs[i] += outer_sizes[i];
} else {
const float scale = inputs[i]->scale() / output->scale();
const float offset =
-inputs[i]->zero_point() * scale + output->zero_point();
for (index_t k = 0; k < outer_sizes[i]; ++k) {
float out = (*input_ptrs[i]) * scale + offset;
*output_ptr = Saturate<uint8_t>(roundf(out));
......@@ -153,35 +174,49 @@ struct ConcatFunctor<DeviceType::CPU, uint8_t> : OpKernel {
}
}
return MaceStatus::MACE_SUCCESS;
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
public:
explicit ConcatOp(OpConstructContext *context)
: ConcatOpBase(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::ConcatKernel<T>(axis_));
} else {
MACE_NOT_IMPLEMENTED;
}
}
MaceStatus Run(OpContext *context) override {
Validate();
Tensor *output = this->Output(0);
return kernel_->Compute(context, inputs_, output);
}
private:
std::unique_ptr<OpenCLConcatKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
void RegisterConcat(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, uint8_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_CONCAT_H_
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_CONV_POOL_2D_BASE_H_
#define MACE_KERNELS_CONV_POOL_2D_BASE_H_
#include <vector>
......@@ -21,18 +21,17 @@
#include "mace/kernels/conv_pool_2d_util.h"
namespace mace {
namespace kernels {
class ConvPool2dOpBase : public Operation {
public:
explicit ConvPool2dOpBase(OpConstructContext *context)
: Operation(context),
strides_(Operation::GetRepeatedArgs<int>("strides")),
padding_type_(static_cast<Padding>(Operation::GetOptionalArg<int>(
"padding", static_cast<int>(SAME)))),
paddings_(Operation::GetRepeatedArgs<int>("padding_values")),
dilations_(Operation::GetRepeatedArgs<int>("dilations", {1, 1})) {}
protected:
std::vector<int> strides_;
......@@ -41,7 +40,7 @@ class ConvPool2dOpBase : public Operator<D, T> {
std::vector<int> dilations_;
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_CONV_POOL_2D_BASE_H_
......@@ -362,7 +362,7 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor,
}
}
return MaceStatus::MACE_SUCCESS;
}
MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
......@@ -408,7 +408,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
}
}
return MaceStatus::MACE_SUCCESS;
}
......@@ -460,7 +460,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor,
}
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace kernels
......
......@@ -12,65 +12,30 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_CROP_H_
#define MACE_KERNELS_CROP_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/crop.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
template <DeviceType D, class T>
class CropOp : public Operation {
public:
explicit CropOp(OpConstructContext *context)
: Operation(context),
axis_(Operation::GetOptionalArg<int>("axis", 2)),
offset_(Operation::GetRepeatedArgs<int>("offset")) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
MACE_CHECK(inputs_.size() == 2, "Crop op needs two inputs.");
Tensor *output = this->Output(0);
const Tensor *input0 = inputs_[0];
const Tensor *input1 = inputs_[1];
const uint32_t in0_dims = static_cast<uint32_t>(input0->dim_size());
const uint32_t in1_dims = static_cast<uint32_t>(input1->dim_size());
......@@ -91,8 +56,8 @@ struct CropFunctor : OpKernel {
crop_offset = offset_[i - axis_];
}
MACE_CHECK(input0->dim(i) - crop_offset >= input1->dim(i))
<< "the crop for dimension" << i << "is out of bound with size"
<< input1->dim(i) << "and offset" << crop_offset;
<< "the crop for dimension" << i << "is out of bound with size"
<< input1->dim(i) << "and offset" << crop_offset;
}
output_shape[i] = new_size;
offsets[i] = crop_offset;
......@@ -105,37 +70,78 @@ struct CropFunctor : OpKernel {
crop_copy(input_data, output_data, input0->shape(),
output_shape, offsets.data());
return MaceStatus::MACE_SUCCESS;
}
private:
void crop_copy(const T* input_data, T* output_data,
const std::vector<index_t> &input_shape,
const std::vector<index_t> &output_shape,
const int32_t* offsets) {
const index_t out_img_size =
output_shape[1] * output_shape[2] * output_shape[3];
const index_t out_hw = output_shape[2] * output_shape[3];
const index_t in_img_size =
input_shape[1] * input_shape[2] * input_shape[3];
const index_t in_hw = input_shape[2] * input_shape[3];
#pragma omp parallel for collapse(3)
for (int b = 0; b < output_shape[0]; ++b) {
for (int c = 0; c < output_shape[1]; ++c) {
for (int h = 0; h < output_shape[2]; ++h) {
T* out_ptr =
output_data + b * out_img_size + c * out_hw + h * output_shape[3];
const T* in_ptr_bch =
input_data + (b + offsets[0]) * in_img_size +
(c + offsets[1]) * in_hw +
(h + offsets[2]) * input_shape[3] + offsets[3];
memcpy(out_ptr, in_ptr_bch,
output_shape[3] * sizeof(T));
}
}
}
}
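// crop_copy copies one output row (the innermost W dimension) per memcpy,
// offsetting the input pointer by the per-dimension crop offsets.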
private:
const int axis_;
std::vector<int> offset_;
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class CropOp<DeviceType::GPU, T> : public Operation {
public:
explicit CropOp(OpConstructContext *context)
: Operation(context) {
const int axis = Operation::GetOptionalArg<int>("axis", 2);
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::CropKernel<T>(
axis, Operation::GetRepeatedArgs<int>("offset")));
} else {
MACE_NOT_IMPLEMENTED;
}
}
MaceStatus Run(OpContext *context) override {
return kernel_->Compute(context, inputs_, this->Output(0));
}
private:
std::unique_ptr<OpenCLCropKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
void RegisterCrop(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_CROP_H_
......@@ -12,32 +12,29 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_DEPTH_TO_SPACE_H_
#define MACE_KERNELS_DEPTH_TO_SPACE_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/kernels/opencl/image/depth_to_space.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
template <DeviceType D, class T>
class DepthToSpaceOp : public Operation {
public:
explicit DepthToSpaceOp(OpConstructContext *context)
: Operation(context),
block_size_(Operation::GetOptionalArg<int>("block_size", 1)) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
const index_t batch_size = input->dim(0);
const index_t input_depth = input->dim(1);
const index_t input_height = input->dim(2);
......@@ -85,36 +82,50 @@ struct DepthToSpaceOpFunctor : OpKernel {
}
}
return MaceStatus::MACE_SUCCESS;
}
private:
const int block_size_;
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class DepthToSpaceOp<DeviceType::GPU, T> : public Operation {
public:
explicit DepthToSpaceOp(OpConstructContext *context)
: Operation(context) {
int block_size = Operation::GetOptionalArg<int>("block_size", 1);
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::DepthToSpaceKernel<T>(block_size));
} else {
MACE_NOT_IMPLEMENTED;
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
return kernel_->Compute(context, input, output);
}
private:
std::unique_ptr<OpenCLDepthToSpaceKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
void RegisterDepthToSpace(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_DEPTH_TO_SPACE_H_
......@@ -18,7 +18,8 @@
#include <tuple>
#include "public/gemmlowp.h"
#include "mace/kernels/quantize.h"
#include "mace/core/types.h"
#include "mace/utils/quantize.h"
namespace mace {
......
......@@ -23,7 +23,7 @@ namespace opencl {
namespace buffer {
namespace depthwise {
MaceStatus DepthwiseConv2d(OpContext *context,
cl::Kernel *kernel,
const Tensor *padded_input, // NHWC
const Tensor *filter, // HWIM
......@@ -127,7 +127,7 @@ MaceStatus DepthwiseConv2d(OpKernelContext *context,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION
return MaceStatus::MACE_SUCCESS;
}
} // namespace depthwise
......