diff --git a/docs/development/adding_a_new_op.md b/docs/development/adding_a_new_op.md
index 2bf0af810845070f77ac174bcbfb7ccfc8f40113..4a631dd7ddd2df38b75a20ceec29bbb2a60e207e 100644
--- a/docs/development/adding_a_new_op.md
+++ b/docs/development/adding_a_new_op.md
@@ -19,7 +19,7 @@ Define the new Op class in `mace/ops/my_custom_op.cc`.
 The structure of Op is like the following code.
 
 ```c++
-#include "mace/core/operator.h"
+#include "mace/core/ops/operator.h"
 
 namespace mace {
 namespace ops {
@@ -39,7 +39,7 @@ class MyCustomOp : public Operation {
 };
 #endif  // MACE_ENABLE_OPENCL
 
-void RegisterMyCustomOp(OpRegistryBase *op_registry) {
+void RegisterMyCustomOp(OpRegistry *op_registry) {
   MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
                    DeviceType::CPU, float);
@@ -63,14 +63,14 @@ namespace ops {
 
 ...
 
-extern void RegisterMyCustomOp(OpRegistryBase *op_registry);
+extern void RegisterMyCustomOp(OpRegistry *op_registry);
 
 ...
 
 }  // namespace ops
 
-OpRegistry::OpRegistry() : OpRegistryBase() {
+OpRegistry::OpRegistry() {
   // Keep in lexicographical order
 
 ...
 
diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst
index 0f9d76093077835b71623edd274801f277a07ae8..dc7344bed145dd0d1c1dea4673226a15d2a1e638 100644
--- a/docs/user_guide/advanced_usage.rst
+++ b/docs/user_guide/advanced_usage.rst
@@ -557,7 +557,7 @@ which will reduce the library size significantly. the final binary just link the
 
     }  // namespace ops
 
-    OpRegistry::OpRegistry() : OpRegistryBase() {
+    OpRegistry::OpRegistry() {
      // Just leave the ops used in your models
 
     ...
 
diff --git a/docs/user_guide/advanced_usage_cmake.rst b/docs/user_guide/advanced_usage_cmake.rst
index 7be5e2f227a6950ae83bc7bb9d218cd1fcb1a87d..23631b93d3de058fab4ce04b3aa2a3fb8bae19cc 100644
--- a/docs/user_guide/advanced_usage_cmake.rst
+++ b/docs/user_guide/advanced_usage_cmake.rst
@@ -370,12 +370,13 @@ the sample code show how to calculate the Top-1 accuracy with imagenet validatio
 Reduce Library Size
 -------------------
 
-Remove the registration of the ops unused for your models in the ``mace/ops/ops_register.cc``,
-which will reduce the library size significantly. the final binary just link the registered ops' code.
+Remove the registration of the ops and delegators unused for your models in the
+``mace/ops/registry/ops_registry.cc`` and ``mace/ops/registry/op_delegators_registry.cc``,
+which will reduce the library size significantly. The final binary links only the registered ops' and delegators' code.
 
 .. code-block:: cpp
 
-    #include "mace/ops/ops_register.h"
+    #include "mace/ops/registry/registry.h"
 
     namespace mace {
     namespace ops {
@@ -386,12 +387,38 @@ which will reduce the library size significantly. the final binary just link the
 
     }  // namespace ops
 
-    OpRegistry::OpRegistry() : OpRegistryBase() {
+    void RegisterAllOps(OpRegistry *registry) {
      // Just leave the ops used in your models
 
      ...
 
-      ops::RegisterMyCustomOp(this);
+      ops::RegisterMyCustomOp(registry);
+
+      ...
+
+    }
+
+    }  // namespace mace
+
+.. code-block:: cpp
+
+    #include "mace/ops/registry/registry.h"
+
+    namespace mace {
+    namespace ops {
+    // Just leave the delegators used in your ops
+
+    ...
+
+    }  // namespace ops
+
+
+    void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
+      // Just leave the delegators used in your ops
+
+      ...
+
+      ops::RegisterMyCustomDelegator(registry);
 
      ...
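The documentation above calls `ops::RegisterMyCustomDelegator(registry)` without showing where that function comes from. Below is a minimal sketch of how such a delegator might be declared and registered with the `OpDelegatorRegistry` macros introduced later in this change; the class name `MyCustomGemm`, the reuse of the base `DelegatorParam`, and the key arguments are hypothetical placeholders rather than part of this patch.

```c++
#include "mace/core/ops/op_delegator.h"
#include "mace/core/registry/op_delegator_registry.h"

namespace mace {
namespace ops {

// Hypothetical CPU delegator; real delegators add their own compute
// entry points and usually derive a dedicated param struct from
// DelegatorParam.
class MyCustomGemm : public OpDelegator {
 public:
  explicit MyCustomGemm(const DelegatorParam &param) : OpDelegator(param) {}
};

// Expands to
//   void RegisterMyCustomGemmDelegator(OpDelegatorRegistry *registry)
// and registers the default creator under a key such as
// "MyCustomGemm_CPU_NEON_float" (MACE_CPU_IMPL_TYPE resolves to NEON or
// REF depending on whether MACE_ENABLE_NEON is defined).
MACE_REGISTER_DELEGATOR(
    registry, MyCustomGemm, DelegatorParam,
    MACE_DELEGATOR_KEY(MyCustomGemm, CPU, float, MACE_CPU_IMPL_TYPE))

}  // namespace ops
}  // namespace mace
```

The generated registration function would then be called from `RegisterAllOpDelegators` in `mace/ops/registry/op_delegators_registry.cc`, mirroring the op registration shown above.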
diff --git a/mace/core/BUILD.bazel b/mace/core/BUILD.bazel index 971b2a271c389b11c61f37e1def1ce49b4537a2e..39fc3883d7acab01ecf26533ce4613b8b59f6208 100644 --- a/mace/core/BUILD.bazel +++ b/mace/core/BUILD.bazel @@ -26,6 +26,8 @@ cc_library( srcs = glob( [ "*.cc", + "ops/*.cc", + "registry/*.cc", "runtime/cpu/*.cc", ], exclude = [ @@ -53,6 +55,8 @@ cc_library( hdrs = glob( [ "*.h", + "ops/*.h", + "registry/*.h", "runtime/cpu/*.h", ], exclude = [ @@ -68,7 +72,7 @@ cc_library( ])) + if_hta_enabled(glob([ "runtime/hexagon/*hta*.h", ])) + if_apu_enabled(glob([ - "runtime/apu/*.h" + "runtime/apu/*.h", ])) + if_rpcmem_enabled([ "rpcmem.h", ]), diff --git a/mace/core/CMakeLists.txt b/mace/core/CMakeLists.txt index 25ab20bff9167b3936f8fb2101c3c9165016ea46..775eca5d8699a93428ad2988d7b5b420b8fc1ac4 100644 --- a/mace/core/CMakeLists.txt +++ b/mace/core/CMakeLists.txt @@ -8,9 +8,16 @@ set(CORE_SRCS net.cc net_def_adapter.cc net_optimizer.cc - op_context.cc - operator.cc + ops/op_condition_builder.cc + ops/op_condition_context.cc + ops/op_construct_context.cc + ops/op_context.cc + ops/operator.cc + ops/op_init_context.cc quantize.cc + registry/op_delegator_registry.cc + registry/op_registration_info.cc + registry/ops_registry.cc runtime_failure_mock.cc types.cc workspace.cc diff --git a/mace/core/net.cc b/mace/core/net.cc index 78d40dd7f57440055eea4c48c375071db2e6bf13..1e11654921d7dadfd30636c41c679630e95b5ea0 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "mace/core/net.h" + #include #include #include @@ -20,8 +22,9 @@ #include "mace/core/future.h" #include "mace/core/memory_optimizer.h" -#include "mace/core/net.h" -#include "mace/core/op_context.h" +#include "mace/core/ops/op_init_context.h" +#include "mace/core/ops/op_context.h" +#include "mace/core/registry/ops_registry.h" #include "mace/public/mace.h" #include "mace/port/env.h" #include "mace/utils/conf_util.h" @@ -33,7 +36,7 @@ namespace mace { -SerialNet::SerialNet(const OpRegistryBase *op_registry, +SerialNet::SerialNet(const OpRegistry *op_registry, const NetDef *net_def, Workspace *ws, Device *target_device, diff --git a/mace/core/net.h b/mace/core/net.h index 18ec5134549ddf2a9fa62139034bb051e0afd64e..f761af134cea5c5124c3574601bc8a00acd817d2 100644 --- a/mace/core/net.h +++ b/mace/core/net.h @@ -21,13 +21,14 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" namespace mace { class RunMetadata; class Workspace; class MemoryOptimizer; +class OpRegistry; class NetBase { public: @@ -44,7 +45,7 @@ class NetBase { class SerialNet : public NetBase { public: - SerialNet(const OpRegistryBase *op_registry, + SerialNet(const OpRegistry *op_registry, const NetDef *net_def, Workspace *ws, Device *target_device, diff --git a/mace/core/net_def_adapter.cc b/mace/core/net_def_adapter.cc index 205dcdbe47374b92082a102eeef84dfe149794f3..7aa084b3ea00f0a634c475d08884ba7664382e02 100644 --- a/mace/core/net_def_adapter.cc +++ b/mace/core/net_def_adapter.cc @@ -17,7 +17,9 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/ops/op_condition_context.h" +#include "mace/core/registry/ops_registry.h" #include "mace/utils/math.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/opencl_util.h" @@ -82,7 +84,7 @@ void BuildTransposeOpDef( } // namespace -NetDefAdapter::NetDefAdapter(const OpRegistryBase *op_registry, 
+NetDefAdapter::NetDefAdapter(const OpRegistry *op_registry, const Workspace *ws) : op_registry_(op_registry), ws_(ws) {} diff --git a/mace/core/net_def_adapter.h b/mace/core/net_def_adapter.h index 0268329e4c7d2659492ea777a18606a82e8572bd..b285a4a5d913db3051e02fd93f8a50826e176d17 100644 --- a/mace/core/net_def_adapter.h +++ b/mace/core/net_def_adapter.h @@ -23,14 +23,17 @@ #include "mace/core/types.h" #include "mace/proto/mace.pb.h" #include "mace/port/port.h" -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" #include "mace/core/net_optimizer.h" namespace mace { -class OpRegistryBase; -class Workspace; class Device; +class OpConditionContext; +class OperatorDef; +class OpRegistry; +class Workspace; + /////////////////////////////////////////////////////////////////////////////// /// Conventions @@ -49,7 +52,7 @@ class Device; /////////////////////////////////////////////////////////////////////////////// class NetDefAdapter { public: - NetDefAdapter(const OpRegistryBase *op_registry, + NetDefAdapter(const OpRegistry *op_registry, const Workspace *ws); // Adapt original net_def to a better net. // 1. Adapt device: choose best device for every op in the net. @@ -122,7 +125,7 @@ class NetDefAdapter { std::string DebugString(const NetDef *net_def); private: - const OpRegistryBase *op_registry_; + const OpRegistry *op_registry_; const Workspace *ws_; NetOptimizer net_optimizer_; }; diff --git a/mace/core/operator.cc b/mace/core/operator.cc deleted file mode 100644 index a266ce2b4b4335146a56d80fac61d7229737c006..0000000000000000000000000000000000000000 --- a/mace/core/operator.cc +++ /dev/null @@ -1,371 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include - -#include "mace/core/operator.h" - -namespace mace { -OpConditionContext::OpConditionContext( - const Workspace *ws, - OpConditionContext::TensorShapeMap *info) - : operator_def_(nullptr), - ws_(ws), - device_(nullptr), - tensor_shape_info_(info) {} - -void OpConditionContext::set_operator_def( - const OperatorDef *operator_def) { - operator_def_ = operator_def; - input_data_types_.clear(); -} - -void OpConditionContext::SetInputInfo(size_t idx, - MemoryType mem_type, - DataType dt) { - if (input_mem_types_.empty()) { - // the default inputs' memory types are same as output memory type. - input_mem_types_.resize(operator_def_->input_size(), output_mem_type_); - } - if (input_data_types_.empty()) { - // the default inputs' data types are same as operation's data type. 
- DataType op_dt = static_cast( - ProtoArgHelper::GetOptionalArg( - *operator_def_, "T", static_cast(DataType::DT_FLOAT))); - input_data_types_.resize(operator_def_->input_size(), op_dt); - } - MACE_CHECK(idx < input_mem_types_.size() && idx < input_data_types_.size()); - input_mem_types_[idx] = mem_type; - input_data_types_[idx] = dt; -} - -void OpConditionContext::set_output_mem_type(MemoryType type) { - MACE_CHECK(operator_def_ != nullptr); - output_mem_type_ = type; - input_mem_types_.clear(); -} - -MemoryType OpConditionContext::GetInputMemType(size_t idx) const { - if (input_mem_types_.empty()) { - return output_mem_type_; - } - MACE_CHECK(idx < input_mem_types_.size(), - idx, " < ", input_mem_types_.size()); - return input_mem_types_[idx]; -} - -DataType OpConditionContext::GetInputDataType(size_t idx) const { - if (input_data_types_.empty()) { - // the default inputs' data types are same as operation's data type. - return static_cast( - ProtoArgHelper::GetOptionalArg( - *operator_def_, "T", static_cast(DataType::DT_FLOAT))); - } - MACE_CHECK(idx < input_data_types_.size()); - return input_data_types_[idx]; -} - -#ifdef MACE_ENABLE_OPENCL -void OpConditionContext::SetInputOpenCLBufferType( - size_t idx, OpenCLBufferType buffer_type) { - if (input_opencl_buffer_types_.empty()) { - // the default inputs' memory types are same as output memory type. - input_opencl_buffer_types_.resize(operator_def_->input_size(), - OpenCLBufferType::IN_OUT_CHANNEL); - } - MACE_CHECK(idx < input_opencl_buffer_types_.size()); - input_opencl_buffer_types_[idx] = buffer_type; -} -OpenCLBufferType OpConditionContext::GetInputOpenCLBufferType( - size_t idx) const { - if (input_opencl_buffer_types_.empty()) { - return OpenCLBufferType::IN_OUT_CHANNEL; - } - MACE_CHECK(idx < input_opencl_buffer_types_.size()); - return input_opencl_buffer_types_[idx]; -} -#endif // MACE_ENABLE_OPENCL - -OpConstructContext::OpConstructContext(Workspace *ws) - : operator_def_(nullptr), - ws_(ws), - device_(nullptr) {} - -void OpConstructContext::set_operator_def( - std::shared_ptr operator_def) { - operator_def_ = operator_def; -} - -OpInitContext::OpInitContext(Workspace *ws, Device *device) - : ws_(ws), device_(device) {} - -Operation::Operation(OpConstructContext *context) - : operator_def_(context->operator_def()) {} - -MaceStatus Operation::Init(OpInitContext *context) { - Workspace *ws = context->workspace(); - for (const std::string &input_str : operator_def_->input()) { - const Tensor *tensor = ws->GetTensor(input_str); - MACE_CHECK(tensor != nullptr, "op ", operator_def_->type(), - ": Encountered a non-existing input tensor: ", input_str); - inputs_.push_back(tensor); - } - for (int i = 0; i < operator_def_->output_size(); ++i) { - const std::string output_str = operator_def_->output(i); - if (ws->HasTensor(output_str)) { - outputs_.push_back(ws->GetTensor(output_str)); - } else { - MACE_CHECK( - operator_def_->output_type_size() == 0 || - operator_def_->output_size() == operator_def_->output_type_size(), - "operator output size != operator output type size", - operator_def_->output_size(), - operator_def_->output_type_size()); - DataType output_type; - if (i < operator_def_->output_type_size()) { - output_type = operator_def_->output_type(i); - } else { - output_type = static_cast( - ProtoArgHelper::GetOptionalArg( - *operator_def_, "T", static_cast(DT_FLOAT))); - } - outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor( - output_str, context->device()->allocator(), output_type))); - } - if (i < 
operator_def_->output_shape_size()) { - std::vector - shape_configured(operator_def_->output_shape(i).dims_size()); - for (size_t dim = 0; dim < shape_configured.size(); ++dim) { - shape_configured[dim] = operator_def_->output_shape(i).dims(dim); - } - ws->GetTensor(output_str)->SetShapeConfigured(shape_configured); - } - } - return MaceStatus::MACE_SUCCESS; -} - -// op registry -namespace { -class OpKeyBuilder { - public: - explicit OpKeyBuilder(const std::string &op_name); - - OpKeyBuilder &Device(DeviceType device); - - OpKeyBuilder &TypeConstraint(const char *attr_name, - DataType allowed); - - const std::string Build(); - - private: - std::string op_name_; - DeviceType device_type_; - std::map type_constraint_; -}; - -OpKeyBuilder::OpKeyBuilder(const std::string &op_name) : op_name_(op_name) {} - -OpKeyBuilder &OpKeyBuilder::Device(DeviceType device) { - device_type_ = device; - return *this; -} - -OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name, - DataType allowed) { - type_constraint_[attr_name] = allowed; - return *this; -} - -const std::string OpKeyBuilder::Build() { - static const std::vector type_order = {"T"}; - std::stringstream ss; - ss << op_name_; - ss << device_type_; - for (auto type : type_order) { - ss << type << "_" << DataTypeToString(type_constraint_[type]); - } - - return ss.str(); -} -} // namespace - -OpRegistrationInfo::OpRegistrationInfo() { - // default device type placer - device_placer = [this](OpConditionContext *context) -> std::set { - MACE_UNUSED(context); - return this->devices; - }; - - // default input and output memory type setter - memory_type_setter = [](OpConditionContext *context) -> void { - if (context->device()->device_type() == DeviceType::GPU) { -#ifdef MACE_ENABLE_OPENCL - if (context->device()->gpu_runtime()->UseImageMemory()) { - context->set_output_mem_type(MemoryType::GPU_IMAGE); - } else { - context->set_output_mem_type(MemoryType::GPU_BUFFER); - } -#endif // MACE_ENABLE_OPENCL - } else { - context->set_output_mem_type(MemoryType::CPU_BUFFER); - } - }; - - data_format_selector = [](OpConditionContext *context) - -> std::vector { - DataFormat op_data_format = - static_cast( - ProtoArgHelper::GetOptionalArg( - *context->operator_def(), "data_format", - static_cast(DataFormat::NONE))); - return std::vector(context->operator_def()->input_size(), - op_data_format); - }; -} - -void OpRegistrationInfo::AddDevice(DeviceType device) { - devices.insert(device); -} - -void OpRegistrationInfo::Register(const std::string &key, OpCreator creator) { - VLOG(3) << "Registering: " << key; - MACE_CHECK(creators.count(key) == 0, "Key already registered: ", key); - creators[key] = creator; -} - -MaceStatus OpRegistryBase::Register( - const std::string &op_type, - const DeviceType device_type, - const DataType dt, - OpRegistrationInfo::OpCreator creator) { - if (registry_.count(op_type) == 0) { - registry_[op_type] = std::unique_ptr( - new OpRegistrationInfo); - } - registry_[op_type]->AddDevice(device_type); - - std::string op_key = OpKeyBuilder(op_type) - .Device(device_type) - .TypeConstraint("T", dt) - .Build(); - registry_.at(op_type)->Register(op_key, creator); - return MaceStatus::MACE_SUCCESS; -} - -MaceStatus OpRegistryBase::Register( - const OpConditionBuilder &builder) { - std::string op_type = builder.type(); - if (registry_.count(op_type) == 0) { - registry_[op_type] = std::unique_ptr( - new OpRegistrationInfo); - } - builder.Finalize(registry_[op_type].get()); - return MaceStatus::MACE_SUCCESS; -} - -const std::set 
OpRegistryBase::AvailableDevices( - const std::string &op_type, OpConditionContext *context) const { - MACE_CHECK(registry_.count(op_type) != 0, - op_type, " operation is not registered."); - - return registry_.at(op_type)->device_placer(context); -} - -void OpRegistryBase::GetInOutMemoryTypes( - const std::string &op_type, - OpConditionContext *context) const { - MACE_CHECK(registry_.count(op_type) != 0, - op_type, " operation is not registered. op_type=", op_type); - return registry_.at(op_type)->memory_type_setter(context); -} - -const std::vector OpRegistryBase::InputsDataFormat( - const std::string &op_type, - OpConditionContext *context) const { - MACE_CHECK(registry_.count(op_type) != 0, - op_type, " operation is not registered."); - return registry_.at(op_type)->data_format_selector(context); -} - -std::unique_ptr OpRegistryBase::CreateOperation( - OpConstructContext *context, - DeviceType device_type) const { - auto operator_def = context->operator_def(); - DataType dtype = static_cast( - ProtoArgHelper::GetOptionalArg( - *operator_def, "T", static_cast(DT_FLOAT))); - VLOG(1) << "Creating operator " << operator_def->name() << "(" - << operator_def->type() << "<" << dtype << ">" << ") on " - << device_type; - const std::string op_type = context->operator_def()->type(); - MACE_CHECK(registry_.count(op_type) != 0, - op_type, " operation is not registered."); - - auto key_dtype = - (device_type == DeviceType::GPU && dtype == DT_HALF) ? DT_FLOAT : dtype; - std::string key = OpKeyBuilder(op_type) - .Device(device_type) - .TypeConstraint("T", key_dtype) - .Build(); - if (registry_.at(op_type)->creators.count(key) == 0) { - LOG(FATAL) << "Key not registered: " << key - << ", op type is: " << operator_def->type(); - } - return registry_.at(op_type)->creators.at(key)(context); -} - -OpConditionBuilder::OpConditionBuilder(const std::string &type) - : type_(type) {} - -const std::string OpConditionBuilder::type() const { - return type_; -} - -OpConditionBuilder &OpConditionBuilder::SetDevicePlacerFunc( - OpRegistrationInfo::DevicePlacer placer) { - placer_ = placer; - return *this; -} - -OpConditionBuilder &OpConditionBuilder::SetInputMemoryTypeSetter( - OpRegistrationInfo::MemoryTypeSetter setter) { - memory_type_setter_ = setter; - return *this; -} - -OpConditionBuilder &OpConditionBuilder::SetInputsDataFormatSelector( - OpRegistrationInfo::DataFormatSelector selector) { - data_format_selector_ = selector; - return *this; -} - -void OpConditionBuilder::Finalize(OpRegistrationInfo *info) const { - if (info != nullptr) { - if (placer_) { - info->device_placer = placer_; - } - if (memory_type_setter_) { - info->memory_type_setter = memory_type_setter_; - } - - if (data_format_selector_) { - info->data_format_selector = data_format_selector_; - } - } -} - -} // namespace mace diff --git a/mace/core/operator.h b/mace/core/operator.h deleted file mode 100644 index fbcbfd2ead3f8d70552464420f450fae17b04b0a..0000000000000000000000000000000000000000 --- a/mace/core/operator.h +++ /dev/null @@ -1,358 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_CORE_OPERATOR_H_ -#define MACE_CORE_OPERATOR_H_ - -#include -#include -#include -#include -#include - -#include "mace/core/arg_helper.h" -#include "mace/core/op_context.h" -#include "mace/core/tensor.h" -#include "mace/core/workspace.h" -#include "mace/proto/mace.pb.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/opencl_util.h" -#endif // MACE_ENABLE_OPENCL - -namespace mace { - -// OpConditionContext has all information used for choosing proper Op -class OpConditionContext { - public: - typedef std::unordered_map> TensorShapeMap; - OpConditionContext(const Workspace *ws, TensorShapeMap *info); - ~OpConditionContext() = default; - - void set_operator_def(const OperatorDef *operator_def); - - inline const OperatorDef *operator_def() const { - return operator_def_; - } - - inline const Workspace *workspace() const { - return ws_; - } - - inline void set_device(Device *device) { - device_ = device; - } - - inline Device *device() const { - return device_; - } - - inline TensorShapeMap *tensor_shape_info() const { - return tensor_shape_info_; - } - - void set_output_mem_type(MemoryType type); - - inline MemoryType output_mem_type() const { - return output_mem_type_; - } - - void SetInputInfo(size_t idx, MemoryType mem_type, DataType dt); - - MemoryType GetInputMemType(size_t idx) const; - - DataType GetInputDataType(size_t idx) const; - -#ifdef MACE_ENABLE_OPENCL - void SetInputOpenCLBufferType(size_t idx, OpenCLBufferType buffer_type); - OpenCLBufferType GetInputOpenCLBufferType(size_t idx) const; -#endif // MACE_ENABLE_OPENCL - - private: - const OperatorDef *operator_def_; - const Workspace *ws_; - Device *device_; - TensorShapeMap *tensor_shape_info_; - // used for memory transform - std::vector input_mem_types_; - std::vector input_data_types_; - MemoryType output_mem_type_; // there is only one output memory type now. 
-#ifdef MACE_ENABLE_OPENCL - std::vector input_opencl_buffer_types_; -#endif // MACE_ENABLE_OPENCL -}; - -// memory_optimizer, device -class OpConstructContext { - typedef std::unordered_map> TensorShapeMap; - - public: - explicit OpConstructContext(Workspace *ws); - ~OpConstructContext() = default; - - void set_operator_def(std::shared_ptr operator_def); - - inline std::shared_ptr operator_def() const { - return operator_def_; - } - - inline Workspace *workspace() const { - return ws_; - } - - inline void set_device(Device *device) { - device_ = device; - } - - inline Device *device() const { - return device_; - } -#ifdef MACE_ENABLE_OPENCL - inline MemoryType GetOpMemoryType() const { - return static_cast( - ProtoArgHelper::GetOptionalArg( - *operator_def_, OutputMemoryTypeTagName(), - static_cast(MemoryType::CPU_BUFFER))); - } -#endif // MACE_ENABLE_OPENCL - - private: - std::shared_ptr operator_def_; - Workspace *ws_; - Device *device_; -}; - -// memory_optimizer, device -class OpInitContext { - public: - explicit OpInitContext(Workspace *ws, Device *device = nullptr); - ~OpInitContext() = default; - - inline Workspace *workspace() const { - return ws_; - } - - inline void set_device(Device *device) { - device_ = device; - } - - inline Device *device() const { - return device_; - } - - private: - Workspace *ws_; - Device *device_; -}; - -// Conventions -// * If there exist format, NHWC is the default format -// * The input/output format of CPU ops with float data type is NCHW -// * The input/output format of GPU ops and CPU Quantization ops is NHWC -// * Inputs' data type is same as the operation data type by default. -// * The outputs' data type is same as the operation data type by default. -class Operation { - public: - explicit Operation(OpConstructContext *context); - virtual ~Operation() = default; - - template - inline T GetOptionalArg(const std::string &name, - const T &default_value) const { - MACE_CHECK(operator_def_, "operator_def was null!"); - return ProtoArgHelper::GetOptionalArg( - *operator_def_, name, default_value); - } - template - inline std::vector GetRepeatedArgs( - const std::string &name, const std::vector &default_value = {}) const { - MACE_CHECK(operator_def_, "operator_def was null!"); - return ProtoArgHelper::GetRepeatedArgs( - *operator_def_, name, default_value); - } - - inline DeviceType device_type() const { - return static_cast(operator_def_->device_type()); - } - - inline const Tensor *Input(unsigned int idx) { - MACE_CHECK(idx < inputs_.size()); - return inputs_[idx]; - } - - inline Tensor *Output(int idx) { return outputs_[idx]; } - - inline int InputSize() { return inputs_.size(); } - inline int OutputSize() { return outputs_.size(); } - inline const std::vector &Inputs() const { return inputs_; } - inline const std::vector &Outputs() { return outputs_; } - - // Run Op asynchronously (depends on device), return a future if not nullptr. 
- virtual MaceStatus Init(OpInitContext *); - virtual MaceStatus Run(OpContext *) = 0; - - inline const OperatorDef &debug_def() const { - MACE_CHECK(has_debug_def(), "operator_def was null!"); - return *operator_def_; - } - - inline void set_debug_def( - const std::shared_ptr &operator_def) { - operator_def_ = operator_def; - } - - inline bool has_debug_def() const { return operator_def_ != nullptr; } - - inline std::shared_ptr operator_def() { - return operator_def_; - } - - protected: - std::shared_ptr operator_def_; - std::vector inputs_; - std::vector outputs_; - - MACE_DISABLE_COPY_AND_ASSIGN(Operation); -}; - -// MACE_OP_INPUT_TAGS and MACE_OP_OUTPUT_TAGS are optional features to name the -// indices of the operator's inputs and outputs, in order to avoid confusion. -// For example, for a fully convolution layer that has input, weight and bias, -// you can define its input tags as: -// MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS); -// And in the code, instead of doing -// auto& weight = Input(1); -// you can now do -// auto& weight = Input(WEIGHT); -// to make it more clear. -#define MACE_OP_INPUT_TAGS(first_input, ...) \ - enum _InputTags { first_input = 0, __VA_ARGS__ } -#define MACE_OP_OUTPUT_TAGS(first_input, ...) \ - enum _OutputTags { first_input = 0, __VA_ARGS__ } - -struct OpRegistrationInfo { - public: - typedef std::function(OpConstructContext *)> - OpCreator; - typedef std::function(OpConditionContext *)> - DevicePlacer; - typedef std::function MemoryTypeSetter; - typedef std::function(OpConditionContext *)> - DataFormatSelector; - - OpRegistrationInfo(); - - void AddDevice(DeviceType); - - void Register(const std::string &key, OpCreator creator); - - std::set devices; - std::unordered_map creators; - DevicePlacer device_placer; - MemoryTypeSetter memory_type_setter; - DataFormatSelector data_format_selector; -}; - -class OpConditionBuilder { - public: - explicit OpConditionBuilder(const std::string &type); - - const std::string type() const; - - OpConditionBuilder &SetDevicePlacerFunc( - OpRegistrationInfo::DevicePlacer placer); - - // If you set input memory type for specified Op, - // you must call OpConditionContext::set_output_mem_type - OpConditionBuilder &SetInputMemoryTypeSetter( - OpRegistrationInfo::MemoryTypeSetter setter); - - OpConditionBuilder &SetInputsDataFormatSelector( - OpRegistrationInfo::DataFormatSelector selector); - - void Finalize(OpRegistrationInfo *info) const; - - private: - std::string type_; - OpRegistrationInfo::DevicePlacer placer_; - OpRegistrationInfo::MemoryTypeSetter memory_type_setter_; - OpRegistrationInfo::DataFormatSelector data_format_selector_; -}; - -class OpRegistryBase { - public: - OpRegistryBase() = default; - virtual ~OpRegistryBase() = default; - MaceStatus Register(const std::string &op_type, - const DeviceType device_type, - const DataType dt, - OpRegistrationInfo::OpCreator creator); - - MaceStatus Register(const OpConditionBuilder &builder); - - const std::set AvailableDevices( - const std::string &op_type, OpConditionContext *context) const; - - void GetInOutMemoryTypes( - const std::string &op_type, OpConditionContext *context) const; - - const std::vector InputsDataFormat( - const std::string &op_type, OpConditionContext *context) const; - - std::unique_ptr CreateOperation( - OpConstructContext *context, - DeviceType device_type) const; - - template - static std::unique_ptr DefaultCreator( - OpConstructContext *context) { - return std::unique_ptr(new DerivedType(context)); - } - - private: - std::unordered_map< - 
std::string, - std::unique_ptr> registry_; - MACE_DISABLE_COPY_AND_ASSIGN(OpRegistryBase); -}; - -#define MACE_REGISTER_OP(op_registry, op_type, class_name, device, dt) \ - op_registry->Register(op_type, \ - device, \ - DataTypeToEnum
::value, \ - OpRegistryBase::DefaultCreator>) - -#define MACE_REGISTER_OP_BY_CLASS( \ - op_registry, op_type, class_name, device, dt) \ - op_registry->Register(op_type, \ - device, \ - DataTypeToEnum
::value, \ - OpRegistryBase::DefaultCreator) - -#ifdef MACE_ENABLE_OPENCL -#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \ - op_registry->Register( \ - op_type, \ - DeviceType::GPU, \ - DT_FLOAT, \ - OpRegistryBase::DefaultCreator>) -#else -#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) -#endif - -#define MACE_REGISTER_OP_CONDITION(op_registry, builder) \ - op_registry->Register(builder) - -} // namespace mace - -#endif // MACE_CORE_OPERATOR_H_ diff --git a/mace/core/ops/op_condition_builder.cc b/mace/core/ops/op_condition_builder.cc new file mode 100644 index 0000000000000000000000000000000000000000..4f226e3620f9b9988e83865a2f6f73aa06daef77 --- /dev/null +++ b/mace/core/ops/op_condition_builder.cc @@ -0,0 +1,59 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/ops/op_condition_builder.h" + +namespace mace { + +OpConditionBuilder::OpConditionBuilder(const std::string &type) + : type_(type) {} + +const std::string OpConditionBuilder::type() const { + return type_; +} + +OpConditionBuilder &OpConditionBuilder::SetDevicePlacerFunc( + OpRegistrationInfo::DevicePlacer placer) { + placer_ = placer; + return *this; +} + +OpConditionBuilder &OpConditionBuilder::SetInputMemoryTypeSetter( + OpRegistrationInfo::MemoryTypeSetter setter) { + memory_type_setter_ = setter; + return *this; +} + +OpConditionBuilder &OpConditionBuilder::SetInputsDataFormatSelector( + OpRegistrationInfo::DataFormatSelector selector) { + data_format_selector_ = selector; + return *this; +} + +void OpConditionBuilder::Finalize(OpRegistrationInfo *info) const { + if (info != nullptr) { + if (placer_) { + info->device_placer = placer_; + } + if (memory_type_setter_) { + info->memory_type_setter = memory_type_setter_; + } + + if (data_format_selector_) { + info->data_format_selector = data_format_selector_; + } + } +} + +} // namespace mace diff --git a/mace/core/ops/op_condition_builder.h b/mace/core/ops/op_condition_builder.h new file mode 100644 index 0000000000000000000000000000000000000000..1378e00ec69f56cd32b1b4645346f2a510f104bc --- /dev/null +++ b/mace/core/ops/op_condition_builder.h @@ -0,0 +1,53 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_CORE_OPS_OP_CONDITION_BUILDER_H_ +#define MACE_CORE_OPS_OP_CONDITION_BUILDER_H_ + +#include +#include + +#include "mace/core/registry/op_registration_info.h" +#include "mace/core/types.h" + +namespace mace { +class OpConditionBuilder { + public: + explicit OpConditionBuilder(const std::string &type); + + const std::string type() const; + + OpConditionBuilder &SetDevicePlacerFunc( + OpRegistrationInfo::DevicePlacer placer); + + // If you set input memory type for specified Op, + // you must call OpConditionContext::set_output_mem_type + OpConditionBuilder &SetInputMemoryTypeSetter( + OpRegistrationInfo::MemoryTypeSetter setter); + + OpConditionBuilder &SetInputsDataFormatSelector( + OpRegistrationInfo::DataFormatSelector selector); + + void Finalize(OpRegistrationInfo *info) const; + + private: + std::string type_; + OpRegistrationInfo::DevicePlacer placer_; + OpRegistrationInfo::MemoryTypeSetter memory_type_setter_; + OpRegistrationInfo::DataFormatSelector data_format_selector_; +}; + +} // namespace mace + +#endif // MACE_CORE_OPS_OP_CONDITION_BUILDER_H_ diff --git a/mace/core/ops/op_condition_context.cc b/mace/core/ops/op_condition_context.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb094a8cf2889a1e926363b2a3de24884cad7a98 --- /dev/null +++ b/mace/core/ops/op_condition_context.cc @@ -0,0 +1,104 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/ops/op_condition_context.h" + +#include "mace/core/arg_helper.h" +#include "mace/proto/mace.pb.h" +#include "mace/utils/logging.h" + +namespace mace { + +OpConditionContext::OpConditionContext( + const Workspace *ws, + OpConditionContext::TensorShapeMap *info) + : operator_def_(nullptr), + ws_(ws), + device_(nullptr), + tensor_shape_info_(info) {} + +void OpConditionContext::set_operator_def( + const OperatorDef *operator_def) { + operator_def_ = operator_def; + input_data_types_.clear(); +} + +void OpConditionContext::SetInputInfo(size_t idx, + MemoryType mem_type, + DataType dt) { + if (input_mem_types_.empty()) { + // the default inputs' memory types are same as output memory type. + input_mem_types_.resize(operator_def_->input_size(), output_mem_type_); + } + if (input_data_types_.empty()) { + // the default inputs' data types are same as operation's data type. 
+ DataType op_dt = static_cast( + ProtoArgHelper::GetOptionalArg( + *operator_def_, "T", static_cast(DataType::DT_FLOAT))); + input_data_types_.resize(operator_def_->input_size(), op_dt); + } + MACE_CHECK(idx < input_mem_types_.size() && idx < input_data_types_.size()); + input_mem_types_[idx] = mem_type; + input_data_types_[idx] = dt; +} + +void OpConditionContext::set_output_mem_type(MemoryType type) { + MACE_CHECK(operator_def_ != nullptr); + output_mem_type_ = type; + input_mem_types_.clear(); +} + +MemoryType OpConditionContext::GetInputMemType(size_t idx) const { + if (input_mem_types_.empty()) { + return output_mem_type_; + } + MACE_CHECK(idx < input_mem_types_.size(), + idx, " < ", input_mem_types_.size()); + return input_mem_types_[idx]; +} + +DataType OpConditionContext::GetInputDataType(size_t idx) const { + if (input_data_types_.empty()) { + // the default inputs' data types are same as operation's data type. + return static_cast( + ProtoArgHelper::GetOptionalArg( + *operator_def_, "T", static_cast(DataType::DT_FLOAT))); + } + MACE_CHECK(idx < input_data_types_.size()); + return input_data_types_[idx]; +} + +#ifdef MACE_ENABLE_OPENCL +void OpConditionContext::SetInputOpenCLBufferType( + size_t idx, OpenCLBufferType buffer_type) { + if (input_opencl_buffer_types_.empty()) { + // the default inputs' memory types are same as output memory type. + input_opencl_buffer_types_.resize(operator_def_->input_size(), + OpenCLBufferType::IN_OUT_CHANNEL); + } + MACE_CHECK(idx < input_opencl_buffer_types_.size()); + input_opencl_buffer_types_[idx] = buffer_type; +} + +OpenCLBufferType OpConditionContext::GetInputOpenCLBufferType( + size_t idx) const { + if (input_opencl_buffer_types_.empty()) { + return OpenCLBufferType::IN_OUT_CHANNEL; + } + MACE_CHECK(idx < input_opencl_buffer_types_.size()); + return input_opencl_buffer_types_[idx]; +} +#endif // MACE_ENABLE_OPENCL + +} // namespace mace diff --git a/mace/core/ops/op_condition_context.h b/mace/core/ops/op_condition_context.h new file mode 100644 index 0000000000000000000000000000000000000000..8e1c882e2cc7f94bab2d4266e5365b99b916aa19 --- /dev/null +++ b/mace/core/ops/op_condition_context.h @@ -0,0 +1,94 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_CORE_OPS_OP_CONDITION_CONTEXT_H_ +#define MACE_CORE_OPS_OP_CONDITION_CONTEXT_H_ + +#include +#include +#include +#include + +#include "mace/core/types.h" + +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/opencl_util.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +class Workspace; +class Device; + +// OpConditionContext has all information used for choosing proper Op +class OpConditionContext { + public: + typedef std::unordered_map> TensorShapeMap; + OpConditionContext(const Workspace *ws, TensorShapeMap *info); + ~OpConditionContext() = default; + + void set_operator_def(const OperatorDef *operator_def); + + const OperatorDef *operator_def() const { + return operator_def_; + } + + const Workspace *workspace() const { + return ws_; + } + + void set_device(Device *device) { + device_ = device; + } + + Device *device() const { + return device_; + } + + TensorShapeMap *tensor_shape_info() const { + return tensor_shape_info_; + } + + void set_output_mem_type(MemoryType type); + + MemoryType output_mem_type() const { + return output_mem_type_; + } + + void SetInputInfo(size_t idx, MemoryType mem_type, DataType dt); + + MemoryType GetInputMemType(size_t idx) const; + + DataType GetInputDataType(size_t idx) const; + +#ifdef MACE_ENABLE_OPENCL + void SetInputOpenCLBufferType(size_t idx, OpenCLBufferType buffer_type); + OpenCLBufferType GetInputOpenCLBufferType(size_t idx) const; +#endif // MACE_ENABLE_OPENCL + + private: + const OperatorDef *operator_def_; + const Workspace *ws_; + Device *device_; + TensorShapeMap *tensor_shape_info_; + // used for memory transform + std::vector input_mem_types_; + std::vector input_data_types_; + MemoryType output_mem_type_; // there is only one output memory type now. +#ifdef MACE_ENABLE_OPENCL + std::vector input_opencl_buffer_types_; +#endif // MACE_ENABLE_OPENCL +}; +} // namespace mace + +#endif // MACE_CORE_OPS_OP_CONDITION_CONTEXT_H_ diff --git a/mace/core/ops/op_construct_context.cc b/mace/core/ops/op_construct_context.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc701259bf7f397dc5b85e4fba36d54f0a2a1036 --- /dev/null +++ b/mace/core/ops/op_construct_context.cc @@ -0,0 +1,29 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/ops/op_construct_context.h" + +namespace mace { + +OpConstructContext::OpConstructContext(Workspace *ws) + : operator_def_(nullptr), + ws_(ws), + device_(nullptr) {} + +void OpConstructContext::set_operator_def( + std::shared_ptr operator_def) { + operator_def_ = operator_def; +} + +} // namespace mace diff --git a/mace/core/ops/op_construct_context.h b/mace/core/ops/op_construct_context.h new file mode 100644 index 0000000000000000000000000000000000000000..9bd4709da3359a57f24155d4c394e15ae8951d0e --- /dev/null +++ b/mace/core/ops/op_construct_context.h @@ -0,0 +1,73 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_OPS_OP_CONSTRUCT_CONTEXT_H_ +#define MACE_CORE_OPS_OP_CONSTRUCT_CONTEXT_H_ + +#include +#include +#include +#include + +#include "mace/core/arg_helper.h" +#include "mace/core/types.h" +#include "mace/proto/mace.pb.h" + +namespace mace { +class Device; +class Workspace; + +// memory_optimizer, device +class OpConstructContext { + typedef std::unordered_map> TensorShapeMap; + + public: + explicit OpConstructContext(Workspace *ws); + ~OpConstructContext() = default; + + void set_operator_def(std::shared_ptr operator_def); + + std::shared_ptr operator_def() const { + return operator_def_; + } + + Workspace *workspace() const { + return ws_; + } + + void set_device(Device *device) { + device_ = device; + } + + Device *device() const { + return device_; + } +#ifdef MACE_ENABLE_OPENCL + inline MemoryType GetOpMemoryType() const { + return static_cast( + ProtoArgHelper::GetOptionalArg( + *operator_def_, OutputMemoryTypeTagName(), + static_cast(MemoryType::CPU_BUFFER))); + } +#endif // MACE_ENABLE_OPENCL + + private: + std::shared_ptr operator_def_; + Workspace *ws_; + Device *device_; +}; + +} // namespace mace + +#endif // MACE_CORE_OPS_OP_CONSTRUCT_CONTEXT_H_ diff --git a/mace/core/op_context.cc b/mace/core/ops/op_context.cc similarity index 96% rename from mace/core/op_context.cc rename to mace/core/ops/op_context.cc index d0ebeff7a733ed95f0d47275427bb130cb8bc446..641609952cb250fd6827c1e762c1fac75af85dda 100644 --- a/mace/core/op_context.cc +++ b/mace/core/ops/op_context.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" namespace mace { diff --git a/mace/core/op_context.h b/mace/core/ops/op_context.h similarity index 90% rename from mace/core/op_context.h rename to mace/core/ops/op_context.h index 26a31dc3c1d5df9f3665114d463890b029b598d1..062254793f30d9ac2d4db6bede9a1d103eafd6b8 100644 --- a/mace/core/op_context.h +++ b/mace/core/ops/op_context.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_CORE_OP_CONTEXT_H_ -#define MACE_CORE_OP_CONTEXT_H_ +#ifndef MACE_CORE_OPS_OP_CONTEXT_H_ +#define MACE_CORE_OPS_OP_CONTEXT_H_ #include "mace/core/device.h" #include "mace/core/workspace.h" @@ -35,8 +35,7 @@ class OpContext { Device *device_; Workspace *ws_; StatsFuture *future_; - // metadata }; } // namespace mace -#endif // MACE_CORE_OP_CONTEXT_H_ +#endif // MACE_CORE_OPS_OP_CONTEXT_H_ diff --git a/mace/core/ops/op_delegator.h b/mace/core/ops/op_delegator.h new file mode 100644 index 0000000000000000000000000000000000000000..029bd39f814e8b69507a0a2db162732885fb2acd --- /dev/null +++ b/mace/core/ops/op_delegator.h @@ -0,0 +1,58 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_OPS_OP_DELEGATOR_H_ +#define MACE_CORE_OPS_OP_DELEGATOR_H_ + +#include + +#include "mace/utils/macros.h" +#include "mace/utils/memory.h" + +namespace mace { + +enum ImplType { + REF = 0, + NEON, +}; + +#ifdef MACE_ENABLE_NEON +#define MACE_CPU_IMPL_TYPE NEON +#else +#define MACE_CPU_IMPL_TYPE REF +#endif + +struct DelegatorParam { + public: + DelegatorParam() = default; + virtual ~DelegatorParam() = default; +}; + +class OpDelegator { + public: + explicit OpDelegator(const DelegatorParam ¶m) { + MACE_UNUSED(param); + } + virtual ~OpDelegator() = default; + + template + static std::unique_ptr DefaultCreator( + const DelegatorParam ¶m) { + return make_unique(static_cast(param)); + } +}; + +} // namespace mace + +#endif // MACE_CORE_OPS_OP_DELEGATOR_H_ diff --git a/mace/core/ops/op_init_context.cc b/mace/core/ops/op_init_context.cc new file mode 100644 index 0000000000000000000000000000000000000000..2b3cee2505da7e2b66279b1d45c2b108d611ce5b --- /dev/null +++ b/mace/core/ops/op_init_context.cc @@ -0,0 +1,22 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/ops/op_init_context.h" + +namespace mace { + +OpInitContext::OpInitContext(Workspace *ws, Device *device) + : ws_(ws), device_(device) {} + +} // namespace mace diff --git a/mace/core/ops/op_init_context.h b/mace/core/ops/op_init_context.h new file mode 100644 index 0000000000000000000000000000000000000000..da51cc23536b016f8ba9f256adc6438c1fa0c100 --- /dev/null +++ b/mace/core/ops/op_init_context.h @@ -0,0 +1,47 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_CORE_OPS_OP_INIT_CONTEXT_H_ +#define MACE_CORE_OPS_OP_INIT_CONTEXT_H_ + +namespace mace { +class Workspace; +class Device; + +// memory_optimizer, device +class OpInitContext { + public: + explicit OpInitContext(Workspace *ws, Device *device = nullptr); + ~OpInitContext() = default; + + Workspace *workspace() const { + return ws_; + } + + void set_device(Device *device) { + device_ = device; + } + + Device *device() const { + return device_; + } + + private: + Workspace *ws_; + Device *device_; +}; + +} // namespace mace + +#endif // MACE_CORE_OPS_OP_INIT_CONTEXT_H_ diff --git a/mace/core/ops/operator.cc b/mace/core/ops/operator.cc new file mode 100644 index 0000000000000000000000000000000000000000..5aa102d91717723e3db1492a3a6f195349961e03 --- /dev/null +++ b/mace/core/ops/operator.cc @@ -0,0 +1,68 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/ops/operator.h" + +#include + +#include "mace/core/ops/op_construct_context.h" +#include "mace/core/ops/op_init_context.h" + +namespace mace { +Operation::Operation(OpConstructContext *context) + : operator_def_(context->operator_def()) {} + +MaceStatus Operation::Init(OpInitContext *context) { + Workspace *ws = context->workspace(); + for (const std::string &input_str : operator_def_->input()) { + const Tensor *tensor = ws->GetTensor(input_str); + MACE_CHECK(tensor != nullptr, "op ", operator_def_->type(), + ": Encountered a non-existing input tensor: ", input_str); + inputs_.push_back(tensor); + } + for (int i = 0; i < operator_def_->output_size(); ++i) { + const std::string output_str = operator_def_->output(i); + if (ws->HasTensor(output_str)) { + outputs_.push_back(ws->GetTensor(output_str)); + } else { + MACE_CHECK( + operator_def_->output_type_size() == 0 || + operator_def_->output_size() == operator_def_->output_type_size(), + "operator output size != operator output type size", + operator_def_->output_size(), + operator_def_->output_type_size()); + DataType output_type; + if (i < operator_def_->output_type_size()) { + output_type = operator_def_->output_type(i); + } else { + output_type = static_cast( + ProtoArgHelper::GetOptionalArg( + *operator_def_, "T", static_cast(DT_FLOAT))); + } + outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor( + output_str, context->device()->allocator(), output_type))); + } + if (i < operator_def_->output_shape_size()) { + std::vector + shape_configured(operator_def_->output_shape(i).dims_size()); + for (size_t dim = 0; dim < shape_configured.size(); ++dim) { + shape_configured[dim] = operator_def_->output_shape(i).dims(dim); + } + ws->GetTensor(output_str)->SetShapeConfigured(shape_configured); + } + } + return MaceStatus::MACE_SUCCESS; +} + +} // namespace mace diff --git a/mace/core/ops/operator.h b/mace/core/ops/operator.h new file mode 100644 index 0000000000000000000000000000000000000000..bb4a20d554fa7159cec1f022252cf9f6870f5fa0 --- /dev/null +++ b/mace/core/ops/operator.h @@ -0,0 +1,120 @@ +// 
Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_OPS_OPERATOR_H_ +#define MACE_CORE_OPS_OPERATOR_H_ + +#include +#include +#include + +#include "mace/core/arg_helper.h" +#include "mace/core/ops/op_construct_context.h" +#include "mace/core/ops/op_context.h" +#include "mace/core/tensor.h" +#include "mace/proto/mace.pb.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/opencl_util.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +class OpInitContext; +// Conventions +// * If there exist format, NHWC is the default format +// * The input/output format of CPU ops with float data type is NCHW +// * The input/output format of GPU ops and CPU Quantization ops is NHWC +// * Inputs' data type is same as the operation data type by default. +// * The outputs' data type is same as the operation data type by default. +class Operation { + public: + explicit Operation(OpConstructContext *context); + virtual ~Operation() = default; + + template + T GetOptionalArg(const std::string &name, + const T &default_value) const { + MACE_CHECK(operator_def_, "operator_def was null!"); + return ProtoArgHelper::GetOptionalArg( + *operator_def_, name, default_value); + } + template + std::vector GetRepeatedArgs( + const std::string &name, const std::vector &default_value = {}) const { + MACE_CHECK(operator_def_, "operator_def was null!"); + return ProtoArgHelper::GetRepeatedArgs( + *operator_def_, name, default_value); + } + + DeviceType device_type() const { + return static_cast(operator_def_->device_type()); + } + + const Tensor *Input(unsigned int idx) { + MACE_CHECK(idx < inputs_.size()); + return inputs_[idx]; + } + + Tensor *Output(int idx) { return outputs_[idx]; } + + int InputSize() { return inputs_.size(); } + int OutputSize() { return outputs_.size(); } + const std::vector &Inputs() const { return inputs_; } + const std::vector &Outputs() { return outputs_; } + + // Run Op asynchronously (depends on device), return a future if not nullptr. + virtual MaceStatus Init(OpInitContext *); + virtual MaceStatus Run(OpContext *) = 0; + + const OperatorDef &debug_def() const { + MACE_CHECK(has_debug_def(), "operator_def was null!"); + return *operator_def_; + } + + void set_debug_def( + const std::shared_ptr &operator_def) { + operator_def_ = operator_def; + } + + bool has_debug_def() const { return operator_def_ != nullptr; } + + inline std::shared_ptr operator_def() { + return operator_def_; + } + + protected: + std::shared_ptr operator_def_; + std::vector inputs_; + std::vector outputs_; + + MACE_DISABLE_COPY_AND_ASSIGN(Operation); +}; + +// MACE_OP_INPUT_TAGS and MACE_OP_OUTPUT_TAGS are optional features to name the +// indices of the operator's inputs and outputs, in order to avoid confusion. 
+// For example, for a fully convolution layer that has input, weight and bias, +// you can define its input tags as: +// MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS); +// And in the code, instead of doing +// auto& weight = Input(1); +// you can now do +// auto& weight = Input(WEIGHT); +// to make it more clear. +#define MACE_OP_INPUT_TAGS(first_input, ...) \ + enum _InputTags { first_input = 0, __VA_ARGS__ } +#define MACE_OP_OUTPUT_TAGS(first_input, ...) \ + enum _OutputTags { first_input = 0, __VA_ARGS__ } + +} // namespace mace + +#endif // MACE_CORE_OPS_OPERATOR_H_ diff --git a/mace/core/registry/op_delegator_registry.cc b/mace/core/registry/op_delegator_registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..006f5555f8710ddd667166c182088b86de6e2af5 --- /dev/null +++ b/mace/core/registry/op_delegator_registry.cc @@ -0,0 +1,39 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/registry/op_delegator_registry.h" + +#include + +#include "mace/utils/logging.h" + +namespace mace { + +MaceStatus OpDelegatorRegistry::Register(const std::string &key, + DelegatorCreator creator) { + MACE_CHECK(registry_.count(key) == 0, "Register an exist key."); + registry_[key] = std::move(creator); + return MaceStatus::MACE_SUCCESS; +} + +DelegatorCreator OpDelegatorRegistry::GetCreator(const std::string &key) const { + MACE_CHECK(registry_.count(key) > 0, key, " not exist."); + return registry_.at(key); +} + +template<> const char *DType::name_ = "float"; +template<> const char *DType::name_ = "int"; +template<> const char *DType::name_ = "uint8_t"; + +} // namespace mace diff --git a/mace/core/registry/op_delegator_registry.h b/mace/core/registry/op_delegator_registry.h new file mode 100644 index 0000000000000000000000000000000000000000..f70d5555792b19419d48c84fd06ad9f422096d95 --- /dev/null +++ b/mace/core/registry/op_delegator_registry.h @@ -0,0 +1,94 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#ifndef MACE_CORE_REGISTRY_OP_DELEGATOR_REGISTRY_H_
+#define MACE_CORE_REGISTRY_OP_DELEGATOR_REGISTRY_H_
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "mace/core/ops/op_delegator.h"
+#include "mace/proto/mace.pb.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+typedef std::function<std::unique_ptr<OpDelegator>(const DelegatorParam &)>
+    DelegatorCreator;
+
+class OpDelegatorRegistry {
+ public:
+  OpDelegatorRegistry() = default;
+  ~OpDelegatorRegistry() = default;
+
+  MaceStatus Register(const std::string &key, DelegatorCreator creator);
+  DelegatorCreator GetCreator(const std::string &key) const;
+
+ private:
+  std::unordered_map<std::string, DelegatorCreator> registry_;
+};
+
+template<typename T>
+struct DType { static const char *name_; };
+template<> const char *DType<float>::name_;
+template<> const char *DType<int>::name_;
+template<> const char *DType<uint8_t>::name_;
+
+
+}  // namespace mace
+
+#ifndef MACE_DELEGATOR_KEY_TMP
+#define MACE_DELEGATOR_KEY_TMP(delegator_name, device, DT, impl) \
+  (std::string(#delegator_name"_"#device"_"#impl"_") + DType<DT>::name_)
+#endif  // MACE_DELEGATOR_KEY_TMP
+
+#ifndef MACE_DELEGATOR_KEY
+#define MACE_DELEGATOR_KEY(delegator_name, device, DT, impl) \
+  MACE_DELEGATOR_KEY_TMP(delegator_name, device, DT, impl)
+#endif  // MACE_DELEGATOR_KEY
+
+#ifndef MACE_DELEGATOR_KEY_EX_TMP
+#define MACE_DELEGATOR_KEY_EX_TMP(delegator_name, device, DT, impl, tag) \
+  (std::string(#delegator_name"_"#device"_"#impl"_"#tag"_") + DType<DT>::name_)
+#endif  // MACE_DELEGATOR_KEY_EX_TMP
+
+#ifndef MACE_DELEGATOR_KEY_EX
+#define MACE_DELEGATOR_KEY_EX(delegator_name, device, DT, impl, tag) \
+  MACE_DELEGATOR_KEY_EX_TMP(delegator_name, device, DT, impl, tag)
+#endif  // MACE_DELEGATOR_KEY_EX
+
+#ifndef MACE_REGISTER_DELEGATOR
+#define MACE_REGISTER_DELEGATOR(registry, class_name, param_name, key) \
+  void Register##class_name##Delegator(OpDelegatorRegistry *registry) { \
+    registry->Register( \
+        key, OpDelegator::DefaultCreator<class_name, param_name>); \
+  }
+#endif  // MACE_REGISTER_DELEGATOR
+
+#ifndef MACE_DEFINE_DELEGATOR_CREATOR
+#define MACE_DEFINE_DELEGATOR_CREATOR(class_name) \
+  static std::unique_ptr<class_name> Create( \
+      Workspace *workspace, const std::string &tag, \
+      const DelegatorParam &param) { \
+    DelegatorCreator creator = \
+        workspace->GetDelegatorRegistry()->GetCreator(tag); \
+    std::unique_ptr<OpDelegator> delegator = creator(param); \
+    return std::unique_ptr<class_name>( \
+        static_cast<class_name *>(delegator.release())); \
+  }
+#endif  // MACE_DEFINE_DELEGATOR_CREATOR
+
+#endif  // MACE_CORE_REGISTRY_OP_DELEGATOR_REGISTRY_H_
diff --git a/mace/core/registry/op_registration_info.cc b/mace/core/registry/op_registration_info.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e043897648ad644f56ad0b402698783862535630
--- /dev/null
+++ b/mace/core/registry/op_registration_info.cc
@@ -0,0 +1,69 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "mace/core/registry/op_registration_info.h" + +#include +#include +#include +#include + +#include "mace/core/ops/op_condition_context.h" + +namespace mace { +OpRegistrationInfo::OpRegistrationInfo() { + // default device type placer + device_placer = [this](OpConditionContext *context) -> std::set { + MACE_UNUSED(context); + return this->devices; + }; + + // default input and output memory type setter + memory_type_setter = [](OpConditionContext *context) -> void { + if (context->device()->device_type() == DeviceType::GPU) { +#ifdef MACE_ENABLE_OPENCL + if (context->device()->gpu_runtime()->UseImageMemory()) { + context->set_output_mem_type(MemoryType::GPU_IMAGE); + } else { + context->set_output_mem_type(MemoryType::GPU_BUFFER); + } +#endif // MACE_ENABLE_OPENCL + } else { + context->set_output_mem_type(MemoryType::CPU_BUFFER); + } + }; + + data_format_selector = [](OpConditionContext *context) + -> std::vector { + DataFormat op_data_format = + static_cast( + ProtoArgHelper::GetOptionalArg( + *context->operator_def(), "data_format", + static_cast(DataFormat::NONE))); + return std::vector(context->operator_def()->input_size(), + op_data_format); + }; +} + +void OpRegistrationInfo::AddDevice(DeviceType device) { + devices.insert(device); +} + +void OpRegistrationInfo::Register(const std::string &key, OpCreator creator) { + VLOG(3) << "Registering: " << key; + MACE_CHECK(creators.count(key) == 0, "Key already registered: ", key); + creators[key] = std::move(creator); +} + +} // namespace mace diff --git a/mace/core/registry/op_registration_info.h b/mace/core/registry/op_registration_info.h new file mode 100644 index 0000000000000000000000000000000000000000..ed110a3c07bfe040bb3ea53f8e99d92523326513 --- /dev/null +++ b/mace/core/registry/op_registration_info.h @@ -0,0 +1,56 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+
+#ifndef MACE_CORE_REGISTRY_OP_REGISTRATION_INFO_H_
+#define MACE_CORE_REGISTRY_OP_REGISTRATION_INFO_H_
+
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "mace/core/ops/operator.h"
+#include "mace/proto/mace.pb.h"
+
+namespace mace {
+class OpConstructContext;
+class OpConditionContext;
+
+class OpRegistrationInfo {
+ public:
+  typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)>
+      OpCreator;
+  typedef std::function<std::set<DeviceType>(OpConditionContext *)>
+      DevicePlacer;
+  typedef std::function<void(OpConditionContext *)> MemoryTypeSetter;
+  typedef std::function<std::vector<DataFormat>(OpConditionContext *)>
+      DataFormatSelector;
+
+  OpRegistrationInfo();
+
+  void AddDevice(DeviceType);
+
+  void Register(const std::string &key, OpCreator creator);
+
+  std::set<DeviceType> devices;
+  std::unordered_map<std::string, OpCreator> creators;
+  DevicePlacer device_placer;
+  MemoryTypeSetter memory_type_setter;
+  DataFormatSelector data_format_selector;
+};
+}  // namespace mace
+
+#endif  // MACE_CORE_REGISTRY_OP_REGISTRATION_INFO_H_
diff --git a/mace/core/registry/ops_registry.cc b/mace/core/registry/ops_registry.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8a99c9e1d9a4ce00d94254a92b01c9384e25271c
--- /dev/null
+++ b/mace/core/registry/ops_registry.cc
@@ -0,0 +1,149 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "mace/core/registry/ops_registry.h" + +#include +#include +#include +#include +#include + +namespace mace { +namespace { +class OpKeyBuilder { + public: + explicit OpKeyBuilder(const std::string &op_name); + + OpKeyBuilder &Device(DeviceType device); + + OpKeyBuilder &TypeConstraint(const char *attr_name, + DataType allowed); + + const std::string Build(); + + private: + std::string op_name_; + DeviceType device_type_; + std::map type_constraint_; +}; + +OpKeyBuilder::OpKeyBuilder(const std::string &op_name) : op_name_(op_name) {} + +OpKeyBuilder &OpKeyBuilder::Device(DeviceType device) { + device_type_ = device; + return *this; +} + +OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name, + DataType allowed) { + type_constraint_[attr_name] = allowed; + return *this; +} + +const std::string OpKeyBuilder::Build() { + static const std::vector type_order = {"T"}; + std::stringstream ss; + ss << op_name_; + ss << device_type_; + for (auto type : type_order) { + ss << type << "_" << DataTypeToString(type_constraint_[type]); + } + + return ss.str(); +} +} // namespace + +MaceStatus OpRegistry::Register( + const std::string &op_type, + const DeviceType device_type, + const DataType dt, + OpRegistrationInfo::OpCreator creator) { + if (registry_.count(op_type) == 0) { + registry_[op_type] = std::unique_ptr( + new OpRegistrationInfo); + } + registry_[op_type]->AddDevice(device_type); + + std::string op_key = OpKeyBuilder(op_type) + .Device(device_type) + .TypeConstraint("T", dt) + .Build(); + registry_.at(op_type)->Register(op_key, creator); + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus OpRegistry::Register( + const OpConditionBuilder &builder) { + std::string op_type = builder.type(); + if (registry_.count(op_type) == 0) { + registry_[op_type] = std::unique_ptr( + new OpRegistrationInfo); + } + builder.Finalize(registry_[op_type].get()); + return MaceStatus::MACE_SUCCESS; +} + +const std::set OpRegistry::AvailableDevices( + const std::string &op_type, OpConditionContext *context) const { + MACE_CHECK(registry_.count(op_type) != 0, + op_type, " operation is not registered."); + + return registry_.at(op_type)->device_placer(context); +} + +void OpRegistry::GetInOutMemoryTypes( + const std::string &op_type, + OpConditionContext *context) const { + MACE_CHECK(registry_.count(op_type) != 0, + op_type, " operation is not registered. op_type=", op_type); + return registry_.at(op_type)->memory_type_setter(context); +} + +const std::vector OpRegistry::InputsDataFormat( + const std::string &op_type, + OpConditionContext *context) const { + MACE_CHECK(registry_.count(op_type) != 0, + op_type, " operation is not registered."); + return registry_.at(op_type)->data_format_selector(context); +} + +std::unique_ptr OpRegistry::CreateOperation( + OpConstructContext *context, + DeviceType device_type) const { + auto operator_def = context->operator_def(); + DataType dtype = static_cast( + ProtoArgHelper::GetOptionalArg( + *operator_def, "T", static_cast(DT_FLOAT))); + VLOG(1) << "Creating operator " << operator_def->name() << "(" + << operator_def->type() << "<" << dtype << ">" << ") on " + << device_type; + const std::string op_type = context->operator_def()->type(); + MACE_CHECK(registry_.count(op_type) != 0, + op_type, " operation is not registered."); + + auto key_dtype = + (device_type == DeviceType::GPU && dtype == DT_HALF) ? 
DT_FLOAT : dtype; + std::string key = OpKeyBuilder(op_type) + .Device(device_type) + .TypeConstraint("T", key_dtype) + .Build(); + if (registry_.at(op_type)->creators.count(key) == 0) { + LOG(FATAL) << "Key not registered: " << key + << ", op type is: " << operator_def->type(); + } + return registry_.at(op_type)->creators.at(key)(context); +} + +} // namespace mace diff --git a/mace/core/registry/ops_registry.h b/mace/core/registry/ops_registry.h new file mode 100644 index 0000000000000000000000000000000000000000..46476a64d157e6446b5668279e7adedd2df4eec5 --- /dev/null +++ b/mace/core/registry/ops_registry.h @@ -0,0 +1,99 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#ifndef MACE_CORE_REGISTRY_OPS_REGISTRY_H_ +#define MACE_CORE_REGISTRY_OPS_REGISTRY_H_ + +#include +#include +#include +#include +#include + +#include "mace/core/ops/operator.h" +#include "mace/core/ops/op_condition_builder.h" +#include "mace/core/ops/op_condition_context.h" +#include "mace/public/mace.h" +#include "mace/proto/mace.pb.h" +#include "mace/utils/memory.h" + +namespace mace { + +class OpRegistry { + public: + OpRegistry() = default; + virtual ~OpRegistry() = default; + MaceStatus Register(const std::string &op_type, + const DeviceType device_type, + const DataType dt, + OpRegistrationInfo::OpCreator creator); + + MaceStatus Register(const OpConditionBuilder &builder); + + const std::set AvailableDevices( + const std::string &op_type, OpConditionContext *context) const; + + void GetInOutMemoryTypes( + const std::string &op_type, OpConditionContext *context) const; + + const std::vector InputsDataFormat( + const std::string &op_type, OpConditionContext *context) const; + + std::unique_ptr CreateOperation( + OpConstructContext *context, + DeviceType device_type) const; + + template + static std::unique_ptr DefaultCreator( + OpConstructContext *context) { + return make_unique(context); + } + + private: + std::unordered_map> + registry_; + MACE_DISABLE_COPY_AND_ASSIGN(OpRegistry); +}; + +#define MACE_REGISTER_OP(op_registry, op_type, class_name, device, dt) \ + op_registry->Register(op_type, \ + device, \ + DataTypeToEnum
::value, \ + OpRegistry::DefaultCreator>) + +#define MACE_REGISTER_OP_BY_CLASS(\ + op_registry, op_type, class_name, device, dt) \ + op_registry->Register(op_type, \ + device, \ + DataTypeToEnum
::value, \ + OpRegistry::DefaultCreator) + +#ifdef MACE_ENABLE_OPENCL +#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \ + op_registry->Register( \ + op_type, \ + DeviceType::GPU, \ + DT_FLOAT, \ + OpRegistry::DefaultCreator>) +#else +#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) +#endif + +#define MACE_REGISTER_OP_CONDITION(op_registry, builder) \ + op_registry->Register(builder) + +} // namespace mace + +#endif // MACE_CORE_REGISTRY_OPS_REGISTRY_H_ diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index fa9a58915b4a87a1a0d826180839bd103d515d23..08bf59b055714a95b18b6530ae03e38dfacd4acb 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -46,7 +46,7 @@ bool HasHalfTensor(const NetDef &net_def) { return false; } -template +template void DequantizeTensor(Device *device, const unsigned char *model_data, const ConstTensor &const_tensor, @@ -66,7 +66,8 @@ void DequantizeTensor(Device *device, } // namespace -Workspace::Workspace() = default; +Workspace::Workspace(const OpDelegatorRegistry *registry) : + op_delegator_registry_(registry) {} Tensor *Workspace::CreateTensor(const std::string &name, Allocator *alloc, @@ -144,7 +145,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, DataType dst_data_type = const_tensor.data_type(); if (device_type == DeviceType::CPU && - const_tensor.data_type() == DataType::DT_HALF) { + const_tensor.data_type() == DataType::DT_HALF) { dst_data_type = DataType::DT_FLOAT; } else if (!is_quantize_model && const_tensor.quantized()) { if (device_type == GPU && net_def.data_type() != DataType::DT_FLOAT) { @@ -173,13 +174,13 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, if (device_type == DeviceType::CPU && const_tensor.data_type() == DataType::DT_HALF) { - // uncompress the weights of fp16 - auto org_data = reinterpret_cast( - model_data + const_tensor.offset()); - float *dst_data = tensor->mutable_data(); - for (int i = 0; i < const_tensor.data_size(); ++i) { - dst_data[i] = half_float::half_cast(org_data[i]); - } + // uncompress the weights of fp16 + auto org_data = reinterpret_cast( + model_data + const_tensor.offset()); + float *dst_data = tensor->mutable_data(); + for (int i = 0; i < const_tensor.data_size(); ++i) { + dst_data[i] = half_float::half_cast(org_data[i]); + } } else if (!is_quantize_model && const_tensor.quantized()) { // uncompress the weights of uint8 if (dst_data_type != DT_FLOAT) { @@ -401,4 +402,8 @@ void Workspace::RemoveTensor(const std::string &name) { } } +const OpDelegatorRegistry *Workspace::GetDelegatorRegistry() const { + return op_delegator_registry_; +} + } // namespace mace diff --git a/mace/core/workspace.h b/mace/core/workspace.h index 4308f92477de911e5c9a376bea59064aed1590e6..eae7ebd568140aa7ec4f65730ff15ce0c59fae8c 100644 --- a/mace/core/workspace.h +++ b/mace/core/workspace.h @@ -27,13 +27,14 @@ namespace mace { +class OpDelegatorRegistry; class MemoryOptimizer; class Workspace { public: typedef std::map> TensorMap; - Workspace(); + explicit Workspace(const OpDelegatorRegistry *registry); ~Workspace() {} Tensor *CreateTensor(const std::string &name, @@ -71,15 +72,16 @@ class Workspace { void RemoveTensor(const std::string &name); + const OpDelegatorRegistry *GetDelegatorRegistry() const; + private: TensorMap tensor_map_; - std::unique_ptr tensor_buffer_; - PreallocatedPooledAllocator preallocated_allocator_; - bool diffused_buffer_; + const OpDelegatorRegistry *op_delegator_registry_; + MACE_DISABLE_COPY_AND_ASSIGN(Workspace); }; diff --git 
a/mace/libmace/mace.cc b/mace/libmace/mace.cc index b9d3b13c24f1490c688d775f51534c2094c6f377..6ab855f42a0654ec3b8040c27bc66831f7f937af 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -19,8 +19,10 @@ #include "mace/core/device_context.h" #include "mace/core/memory_optimizer.h" #include "mace/core/net.h" -#include "mace/ops/registry/ops_registry.h" +#include "mace/core/registry/ops_registry.h" +#include "mace/core/registry/op_delegator_registry.h" #include "mace/ops/common/transpose.h" +#include "mace/ops/registry/registry.h" #include "mace/utils/math.h" #include "mace/utils/memory.h" #include "mace/utils/stl_util.h" @@ -451,7 +453,8 @@ class MaceEngine::Impl { private: std::unique_ptr model_data_; - std::unique_ptr op_registry_; + std::unique_ptr op_registry_; + std::unique_ptr op_delegator_registry_; DeviceType device_type_; std::unique_ptr device_; std::unique_ptr ws_; @@ -478,9 +481,10 @@ class MaceEngine::Impl { MaceEngine::Impl::Impl(const MaceEngineConfig &config) : model_data_(nullptr), op_registry_(new OpRegistry), + op_delegator_registry_(new OpDelegatorRegistry), device_type_(config.impl_->device_type()), device_(nullptr), - ws_(new Workspace()), + ws_(new Workspace(op_delegator_registry_.get())), net_(nullptr), is_quantized_model_(false), thread_pool_(new utils::ThreadPool(config.impl_->num_threads(), @@ -498,6 +502,8 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config) #endif { LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion(); + ops::RegisterAllOps(op_registry_.get()); + ops::RegisterAllOpDelegators(op_delegator_registry_.get()); thread_pool_->Init(); if (device_type_ == DeviceType::CPU) { device_.reset(new CPUDevice(config.impl_->num_threads(), diff --git a/mace/ops/BUILD.bazel b/mace/ops/BUILD.bazel index 9861198aaa49b99dec5302a0c934f2947e39fc7d..52ad46edfde322f45d12becbb249261beed12498 100644 --- a/mace/ops/BUILD.bazel +++ b/mace/ops/BUILD.bazel @@ -22,11 +22,13 @@ cc_library( srcs = glob( [ "common/*.cc", + "delegator/*.cc", ], ), hdrs = glob( [ "common/*.h", + "delegator/*.h", ], ), copts = [ @@ -58,12 +60,16 @@ cc_library( [ "ref/*.cc", ], - ), + ) + if_quantize_enabled(glob([ + "ref/q8/*.cc", + ])), hdrs = glob( [ "ref/*.h", ], - ), + ) + if_quantize_enabled(glob([ + "ref/q8/*.h", + ])), copts = [ "-Werror", "-Wextra", @@ -236,12 +242,12 @@ cc_library( cc_library( name = "ops", - srcs = [ - "registry/ops_registry.cc", - ], - hdrs = [ - "registry/ops_registry.h", - ], + srcs = glob([ + "registry/*.cc", + ]), + hdrs = glob([ + "registry/*.h", + ]), copts = [ "-Werror", "-Wextra", diff --git a/mace/ops/CMakeLists.txt b/mace/ops/CMakeLists.txt index 7994b445a6bc7aabb82f3198c2c2405f857b4e1b..7de9661d61d05cd6e4ac9d551cbccbb38904f7d4 100644 --- a/mace/ops/CMakeLists.txt +++ b/mace/ops/CMakeLists.txt @@ -1,6 +1,10 @@ file(GLOB OPS_COMMON_SRCS common/*.cc) file(GLOB OPS_REF_KERNELS_SRCS ref/*.cc) +file(GLOB OPS_REF_Q8_KERNELS_SRCS + ref/q8/*.cc +) + file(GLOB OPS_ARM_NEON_FP32_KERNELS_SRCS arm/fp32/*.cc ) @@ -17,20 +21,23 @@ file(GLOB OPS_OPENCL_KERNELS_SRCS file(GLOB OPS_INTERNAL_OPS_SRCS *.cc) -set(OPS_SRCS registry/ops_registry.cc) +set(OPS_SRCS registry/ops_registry.cc registry/op_delegators_registry.cc) set(OPS_SRCS ${OPS_SRCS} ${OPS_COMMON_SRCS}) set(OPS_SRCS ${OPS_SRCS} ${OPS_INTERNAL_OPS_SRCS}) # TODO we need to remove this in production build set(OPS_SRCS ${OPS_SRCS} ${OPS_REF_KERNELS_SRCS}) +if(MACE_ENABLE_QUANTIZE) + set(OPS_SRCS ${OPS_SRCS} ${OPS_REF_Q8_KERNELS_SRCS}) +endif(MACE_ENABLE_QUANTIZE) + if(MACE_ENABLE_NEON) 
set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_FP32_KERNELS_SRCS}) + if(MACE_ENABLE_QUANTIZE) + set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_Q8_KERNELS_SRCS}) + endif(MACE_ENABLE_QUANTIZE) endif(MACE_ENABLE_NEON) -if(MACE_ENABLE_QUANTIZE) - set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_Q8_KERNELS_SRCS}) -endif(MACE_ENABLE_QUANTIZE) - if(MACE_ENABLE_OPENCL) set(OPS_SRCS ${OPS_SRCS} ${OPS_OPENCL_KERNELS_SRCS}) endif(MACE_ENABLE_OPENCL) diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc index 255370568b6eb7a8702900b85b0e2c99d4606a6b..338de7ead4db24e35169bdc6cd681729e84b15b8 100644 --- a/mace/ops/activation.cc +++ b/mace/ops/activation.cc @@ -17,13 +17,10 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" -#if defined(MACE_ENABLE_NEON) -#include "mace/ops/arm/fp32/activation.h" -#else -#include "mace/ops/ref/activation.h" -#endif +#include "mace/ops/delegator/activation.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" @@ -37,19 +34,20 @@ namespace ops { template class ActivationOp; -template<> -class ActivationOp : public Operation { +template +class ActivationOp : public Operation { public: explicit ActivationOp(OpConstructContext *context) : Operation(context), activation_type_(ops::StringToActivationType( - Operation::GetOptionalArg("activation", - "NOOP"))), - activation_delegator_(activation_type_, - Operation::GetOptionalArg("max_limit", - 0.0f), - Operation::GetOptionalArg( - "leakyrelu_coefficient", 0.0f)) {} + Operation::GetOptionalArg("activation", "NOOP"))), + activation_delegator_(delegator::Activation::Create( + context->workspace(), + MACE_DELEGATOR_KEY(Activation, CPU, T, MACE_CPU_IMPL_TYPE), + delegator::ActivationParam( + activation_type_, + Operation::GetOptionalArg("max_limit", 0), + Operation::GetOptionalArg("leakyrelu_coefficient", 0)))) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -58,28 +56,24 @@ class ActivationOp : public Operation { if (activation_type_ == PRELU) { MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - const float *input_ptr = input->data(); - float *output_ptr = output->mutable_data(); + const T *input_ptr = input->data(); + T *output_ptr = output->mutable_data(); MACE_CHECK(this->InputSize() > 1); const Tensor *alpha = this->Input(1); - const float *alpha_ptr = alpha->data(); + const T *alpha_ptr = alpha->data(); const index_t outer_size = output->dim(0); const index_t inner_size = output->dim(2) * output->dim(3); PReLUActivation(context, input_ptr, outer_size, input->dim(1), inner_size, alpha_ptr, output_ptr); } else { - activation_delegator_.Compute(context, input, output); + activation_delegator_->Compute(context, input, output); } return MaceStatus::MACE_SUCCESS; } private: ActivationType activation_type_; -#if defined(MACE_ENABLE_NEON) - arm::fp32::Activation activation_delegator_; -#else - ref::Activation activation_delegator_; -#endif // MACE_ENABLE_NEON + std::unique_ptr activation_delegator_; }; #ifdef MACE_ENABLE_OPENCL @@ -122,7 +116,7 @@ class ActivationOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterActivation(OpRegistryBase *op_registry) { +void RegisterActivation(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, DeviceType::CPU, float); MACE_REGISTER_GPU_OP(op_registry, "Activation", ActivationOp); diff --git a/mace/ops/activation.h b/mace/ops/activation.h index 9ceae6e07ff983e5c577406d60b6616c56da4fc3..4003dd309331a59d64c2ff6ace5299e7cc9587a6 100644 
--- a/mace/ops/activation.h +++ b/mace/ops/activation.h @@ -20,7 +20,7 @@ #include #include "mace/core/types.h" -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/ops/common/activation_type.h" #include "mace/utils/logging.h" diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc index 5b98ba8554caa69929adacefe27b94499d274cd9..8e5ce2e1928a1244ccd0ee27a3aa8c9bdc7a5ec7 100644 --- a/mace/ops/addn.cc +++ b/mace/ops/addn.cc @@ -19,7 +19,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/addn.h" @@ -92,7 +93,7 @@ class AddNOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterAddN(OpRegistryBase *op_registry) { +void RegisterAddN(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float); MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp); MACE_REGISTER_OP_CONDITION( diff --git a/mace/ops/argmax.cc b/mace/ops/argmax.cc index 32007d6ccbcd59cd78670ad7f46aced4a3e6fa4c..5ec9dc92b818196b53ba60c0886467f5f2618bb4 100644 --- a/mace/ops/argmax.cc +++ b/mace/ops/argmax.cc @@ -18,7 +18,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -109,7 +110,7 @@ class ArgMaxOp : public Operation { -void RegisterArgMax(OpRegistryBase *op_registry) { +void RegisterArgMax(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "ArgMax", ArgMaxOp, DeviceType::CPU, float); } diff --git a/mace/ops/arm/fp32/activation.cc b/mace/ops/arm/fp32/activation.cc index cac3badb523262663820b93e2527588f49be4923..8c66bd563093a20941c64a50faa2a68aad891710 100644 --- a/mace/ops/arm/fp32/activation.cc +++ b/mace/ops/arm/fp32/activation.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/ops/arm/fp32/activation.h" +#include "mace/ops/delegator/activation.h" #include #include @@ -22,16 +22,22 @@ namespace ops { namespace arm { namespace fp32 { -Activation::Activation(ActivationType type, - const float limit, - const float leakyrelu_coefficient) - : type_(type), - limit_(limit), - leakyrelu_coefficient_(leakyrelu_coefficient) {} +class Activation : public delegator::Activation { + public: + explicit Activation(const delegator::ActivationParam ¶m) + : delegator::Activation(param) {} + ~Activation() = default; + + MaceStatus Compute(const OpContext *context, + const Tensor *input, Tensor *output) override; + + private: + void DoActivation(const OpContext *context, + const Tensor *input, Tensor *output); +}; MaceStatus Activation::Compute(const OpContext *context, - const Tensor *input, - Tensor *output) { + const Tensor *input, Tensor *output) { Tensor::MappingGuard input_guard(input); if (input != output) { MACE_RETURN_IF_ERROR(output->ResizeLike(input)); @@ -139,7 +145,7 @@ void Activation::DoActivation(const OpContext *context, // remain for (index_t i = block_count * 4; i < size; ++i) { output_data[i] = std::max(input_data[i], 0.f) + - std::min(input_data[i], 0.f) * leakyrelu_coefficient_; + std::min(input_data[i], 0.f) * leakyrelu_coefficient_; } break; @@ -169,14 +175,19 @@ void Activation::DoActivation(const OpContext *context, break; } - case NOOP: + case NOOP: { break; + } - default: + default: { MACE_NOT_IMPLEMENTED; + } } } +MACE_REGISTER_DELEGATOR(registry, Activation, delegator::ActivationParam, + MACE_DELEGATOR_KEY(Activation, CPU, float, NEON)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/activation.h b/mace/ops/arm/fp32/activation.h deleted file mode 100644 index 265915d0c3a8d3bdbab3e4c0d0f60521730dec34..0000000000000000000000000000000000000000 --- a/mace/ops/arm/fp32/activation.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_ARM_FP32_ACTIVATION_H_ -#define MACE_OPS_ARM_FP32_ACTIVATION_H_ - -#include "mace/core/op_context.h" -#include "mace/ops/common/activation_type.h" - -namespace mace { -namespace ops { -namespace arm { -namespace fp32 { - -class Activation { - public: - explicit Activation(ActivationType type, - const float limit, - const float leakyrelu_coefficient); - ~Activation() = default; - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - Tensor *output); - - private: - void DoActivation(const OpContext *context, - const Tensor *input, - Tensor *output); - - ActivationType type_; - const float limit_; - const float leakyrelu_coefficient_; -}; - -} // namespace fp32 -} // namespace arm -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_FP32_ACTIVATION_H_ diff --git a/mace/ops/arm/fp32/bias_add.cc b/mace/ops/arm/fp32/bias_add.cc index f572b22bbd1cfe80b39ff7e2c76727aa8b437fac..fc5a55b3d4d0abf6cdad15bfd540bb20446803af 100644 --- a/mace/ops/arm/fp32/bias_add.cc +++ b/mace/ops/arm/fp32/bias_add.cc @@ -12,15 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/ops/arm/fp32/bias_add.h" - #include +#include "mace/ops/delegator/bias_add.h" namespace mace { namespace ops { namespace arm { namespace fp32 { +class BiasAdd : public delegator::BiasAdd { + public: + explicit BiasAdd(const DelegatorParam ¶m) : delegator::BiasAdd(param) {} + ~BiasAdd() = default; + + MaceStatus Compute(const OpContext *context, const Tensor *input, + const Tensor *bias, Tensor *output) override; + + private: + void AddBias(const OpContext *context, const Tensor *input, + const Tensor *bias, Tensor *output); +}; + MaceStatus BiasAdd::Compute(const OpContext *context, const Tensor *input, const Tensor *bias, @@ -117,6 +129,9 @@ void BiasAdd::AddBias(const OpContext *context, } } +MACE_REGISTER_DELEGATOR(registry, BiasAdd, DelegatorParam, + MACE_DELEGATOR_KEY(BiasAdd, CPU, float, NEON)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/bias_add.h b/mace/ops/arm/fp32/bias_add.h deleted file mode 100644 index a3e6849157472bc9df8117299cf3f0d01ca203d8..0000000000000000000000000000000000000000 --- a/mace/ops/arm/fp32/bias_add.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_ARM_FP32_BIAS_ADD_H_ -#define MACE_OPS_ARM_FP32_BIAS_ADD_H_ - -#include "mace/core/op_context.h" - -namespace mace { -namespace ops { -namespace arm { -namespace fp32 { - -class BiasAdd { - public: - BiasAdd() = default; - ~BiasAdd() = default; - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *bias, - Tensor *output); - - private: - void AddBias(const OpContext *context, - const Tensor *input, - const Tensor *bias, - Tensor *output); -}; - -} // namespace fp32 -} // namespace arm -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_FP32_BIAS_ADD_H_ diff --git a/mace/ops/arm/fp32/conv_2d.h b/mace/ops/arm/fp32/conv_2d.h index dc8d0effd101e77df88473c884fcdb670768379e..a143f5f84c2092c614d60576e27e26ec69d7e3a3 100644 --- a/mace/ops/arm/fp32/conv_2d.h +++ b/mace/ops/arm/fp32/conv_2d.h @@ -18,36 +18,25 @@ #include #include -#include "mace/public/mace.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/core/op_context.h" +#include "mace/ops/delegator/conv_2d.h" #include "mace/ops/arm/fp32/gemm.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/public/mace.h" namespace mace { namespace ops { namespace arm { namespace fp32 { -class Conv2dBase { +class Conv2dBase : public delegator::Conv2d { public: - Conv2dBase(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type) - : strides_(strides), - dilations_(dilations), - paddings_(paddings), - padding_type_(padding_type) {} + explicit Conv2dBase(const delegator::Conv2dParam ¶m) + : delegator::Conv2d(param) {} virtual ~Conv2dBase() = default; - virtual MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) = 0; - protected: void CalOutputShapeAndInputPadSize(const std::vector &input_shape, const std::vector &filter_shape, @@ -83,11 +72,6 @@ class Conv2dBase { const int pad_left, Tensor *dst); void UnPadOutput(const Tensor &src, Tensor *dst); - - const std::vector strides_; - const std::vector dilations_; - const std::vector paddings_; - const Padding padding_type_; }; } // namespace fp32 diff --git a/mace/ops/arm/fp32/conv_2d_1x1.cc b/mace/ops/arm/fp32/conv_2d_1x1.cc index d5e03652bbd25bad8eb43bfb67b2ef98092b9b2f..0aad6be90729aac36bd09d1f9a3bea57ddb82b8b 100644 --- a/mace/ops/arm/fp32/conv_2d_1x1.cc +++ b/mace/ops/arm/fp32/conv_2d_1x1.cc @@ -12,13 +12,32 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/ops/arm/fp32/conv_2d_1x1.h" +#include "mace/ops/arm/fp32/conv_2d.h" +#include "mace/ops/arm/fp32/gemm.h" +#include "mace/ops/delegator/conv_2d.h" namespace mace { namespace ops { namespace arm { namespace fp32 { +class Conv2dK1x1 : public Conv2dBase { + public: + explicit Conv2dK1x1(const delegator::Conv2dParam ¶m) + : Conv2dBase(param), + gemm_(delegator::GemmParam()) {} + virtual ~Conv2dK1x1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) override; + + private: + Gemm gemm_; +}; + MaceStatus Conv2dK1x1::Compute(const OpContext *context, const Tensor *input, const Tensor *filter, @@ -94,6 +113,9 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context, output); } +MACE_REGISTER_DELEGATOR(registry, Conv2dK1x1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K1x1)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/conv_2d_1x1.h b/mace/ops/arm/fp32/conv_2d_1x1.h deleted file mode 100644 index cde94ea01927ad544bb347eaea53bcb55b01f7f8..0000000000000000000000000000000000000000 --- a/mace/ops/arm/fp32/conv_2d_1x1.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_ARM_FP32_CONV_2D_1X1_H_ -#define MACE_OPS_ARM_FP32_CONV_2D_1X1_H_ - -#include -#include "mace/public/mace.h" -#include "mace/core/tensor.h" -#include "mace/core/op_context.h" -#include "mace/ops/arm/fp32/gemm.h" -#include "mace/ops/arm/fp32/conv_2d.h" - -namespace mace { -namespace ops { -namespace arm { -namespace fp32 { - -class Conv2dK1x1 : public Conv2dBase { - public: - Conv2dK1x1(const std::vector &paddings, const Padding padding_type) - : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} - virtual ~Conv2dK1x1() {} - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; - - private: - Gemm gemm_; -}; - -} // namespace fp32 -} // namespace arm -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_FP32_CONV_2D_1X1_H_ diff --git a/mace/ops/arm/fp32/conv_2d_1xn.cc b/mace/ops/arm/fp32/conv_2d_1xn.cc index 3be9e3eb5dca7ecf4ecf66b1371796872c5cd0b5..fc92091f55edf6f9d9eac7a6a285f718d62034e0 100644 --- a/mace/ops/arm/fp32/conv_2d_1xn.cc +++ b/mace/ops/arm/fp32/conv_2d_1xn.cc @@ -17,6 +17,8 @@ #include #include +#include "mace/ops/delegator/conv_2d.h" + namespace mace { namespace ops { namespace arm { @@ -859,6 +861,19 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } +MACE_REGISTER_DELEGATOR(registry, Conv2dK1x7S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K1x7S1)) + +MACE_REGISTER_DELEGATOR(registry, Conv2dK7x1S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x1S1)) + +MACE_REGISTER_DELEGATOR(registry, Conv2dK1x15S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, + NEON, K1x15S1)) +MACE_REGISTER_DELEGATOR(registry, Conv2dK15x1S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, + NEON, K15x1S1)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/conv_2d_1xn.h b/mace/ops/arm/fp32/conv_2d_1xn.h index 0bdd66737907627f7dd44e1cb94c24803ea0c8fc..c0a6da637e3ecffd74da458c71730a8646e365c3 100644 --- a/mace/ops/arm/fp32/conv_2d_1xn.h +++ b/mace/ops/arm/fp32/conv_2d_1xn.h @@ -16,10 +16,11 @@ #define MACE_OPS_ARM_FP32_CONV_2D_1XN_H_ #include -#include "mace/public/mace.h" + +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/core/op_context.h" #include "mace/ops/arm/fp32/conv_2d.h" +#include "mace/public/mace.h" namespace mace { namespace ops { @@ -28,8 +29,8 @@ namespace fp32 { class Conv2dK1x7S1 : public Conv2dBase { public: - Conv2dK1x7S1(const std::vector &paddings, const Padding padding_type) - : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + explicit Conv2dK1x7S1(const delegator::Conv2dParam ¶m) + : Conv2dBase(param) {} virtual ~Conv2dK1x7S1() {} MaceStatus Compute( @@ -41,8 +42,8 @@ class Conv2dK1x7S1 : public Conv2dBase { class Conv2dK7x1S1 : public Conv2dBase { public: - Conv2dK7x1S1(const std::vector &paddings, const Padding padding_type) - : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + explicit Conv2dK7x1S1(const delegator::Conv2dParam ¶m) + : Conv2dBase(param) {} virtual ~Conv2dK7x1S1() {} MaceStatus Compute( @@ -54,8 +55,8 @@ class Conv2dK7x1S1 : public Conv2dBase { class Conv2dK1x15S1 : public Conv2dBase { public: - Conv2dK1x15S1(const std::vector &paddings, const Padding padding_type) - : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + explicit Conv2dK1x15S1(const delegator::Conv2dParam ¶m) + : Conv2dBase(param) {} virtual 
~Conv2dK1x15S1() {} MaceStatus Compute( @@ -67,8 +68,8 @@ class Conv2dK1x15S1 : public Conv2dBase { class Conv2dK15x1S1 : public Conv2dBase { public: - Conv2dK15x1S1(const std::vector &paddings, const Padding padding_type) - : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + explicit Conv2dK15x1S1(const delegator::Conv2dParam ¶m) + : Conv2dBase(param) {} virtual ~Conv2dK15x1S1() {} MaceStatus Compute( diff --git a/mace/ops/arm/fp32/conv_2d_3x3.cc b/mace/ops/arm/fp32/conv_2d_3x3.cc index 95c3034138d9ecab67d1aae0ee770ff07ab20788..37d8ef849f73e53d4afebc55ac19efe50fe7c02b 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3.cc +++ b/mace/ops/arm/fp32/conv_2d_3x3.cc @@ -17,6 +17,8 @@ #include #include +#include "mace/ops/delegator/conv_2d.h" + namespace mace { namespace ops { namespace arm { @@ -735,6 +737,11 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } +MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K3x3S1)) +MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3S2, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K3x3S2)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/conv_2d_3x3.h b/mace/ops/arm/fp32/conv_2d_3x3.h index bd96501d98f32ebe9ffe0bad98cccee67bc0b062..e64d061e3e6103f78901c144d9866d047e8dfc96 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3.h +++ b/mace/ops/arm/fp32/conv_2d_3x3.h @@ -16,10 +16,11 @@ #define MACE_OPS_ARM_FP32_CONV_2D_3X3_H_ #include -#include "mace/public/mace.h" + +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/core/op_context.h" #include "mace/ops/arm/fp32/conv_2d.h" +#include "mace/public/mace.h" namespace mace { namespace ops { @@ -28,8 +29,8 @@ namespace fp32 { class Conv2dK3x3S1 : public Conv2dBase { public: - Conv2dK3x3S1(const std::vector &paddings, const Padding padding_type) - : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + explicit Conv2dK3x3S1(const delegator::Conv2dParam ¶m) + : Conv2dBase(param) {} virtual ~Conv2dK3x3S1() {} MaceStatus Compute( @@ -41,8 +42,8 @@ class Conv2dK3x3S1 : public Conv2dBase { class Conv2dK3x3S2 : public Conv2dBase { public: - Conv2dK3x3S2(const std::vector &paddings, const Padding padding_type) - : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {} + explicit Conv2dK3x3S2(const delegator::Conv2dParam ¶m) + : Conv2dBase(param) {} virtual ~Conv2dK3x3S2() {} MaceStatus Compute( diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc index ab2517bf6295691de4ba00fd22d9e651e1e13fee..cbdb7d66443e5d47759dcb8fe44890f85f2c4d5a 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc +++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc @@ -17,6 +17,7 @@ #include #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/delegator/conv_2d.h" #include "mace/utils/memory.h" #include "mace/utils/math.h" @@ -800,6 +801,10 @@ void Conv2dK3x3Winograd::TransformOutput8x8(const OpContext *context, }, 0, batch, 1, 0, out_channels, 1); } +MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3Winograd, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX( + Conv2d, CPU, float, NEON, K3x3Winograd)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h index 53118a6aea3b2d8d3a75b08fa5d0b0f84ef69203..ec4db81bb2d552615430b81e330ef0ff862c563f 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h +++ 
b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h @@ -18,11 +18,11 @@ #include #include -#include "mace/public/mace.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/core/op_context.h" -#include "mace/ops/arm/fp32/gemm.h" #include "mace/ops/arm/fp32/conv_2d.h" +#include "mace/ops/arm/fp32/gemm.h" +#include "mace/public/mace.h" namespace mace { namespace ops { @@ -31,10 +31,9 @@ namespace fp32 { class Conv2dK3x3Winograd : public Conv2dBase { public: - Conv2dK3x3Winograd(const std::vector &paddings, - const Padding padding_type) - : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type), - gemm_(), + explicit Conv2dK3x3Winograd(const delegator::Conv2dParam ¶m) + : Conv2dBase(param), + gemm_(delegator::GemmParam()), transformed_filter_(nullptr), out_tile_size_(0) {} diff --git a/mace/ops/arm/fp32/conv_2d_5x5.cc b/mace/ops/arm/fp32/conv_2d_5x5.cc index 1b41ec7ccd87a14e5683e1f84bc6f967e159b5b3..cc117cf98637b2f886007ae15ffe75d47f884ff0 100644 --- a/mace/ops/arm/fp32/conv_2d_5x5.cc +++ b/mace/ops/arm/fp32/conv_2d_5x5.cc @@ -12,16 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/ops/arm/fp32/conv_2d_5x5.h" - #include #include +#include "mace/ops/arm/fp32/conv_2d.h" +#include "mace/ops/delegator/conv_2d.h" + namespace mace { namespace ops { namespace arm { namespace fp32 { +class Conv2dK5x5S1 : public Conv2dBase { + public: + explicit Conv2dK5x5S1(const delegator::Conv2dParam ¶m) + : Conv2dBase(param) {} + virtual ~Conv2dK5x5S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) override; +}; + #define MACE_Conv2dNeonK5x5SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ float32x4_t vf00, vf10, vf20, vf30; \ @@ -244,6 +258,9 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } +MACE_REGISTER_DELEGATOR(registry, Conv2dK5x5S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K5x5S1)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/conv_2d_5x5.h b/mace/ops/arm/fp32/conv_2d_5x5.h deleted file mode 100644 index b6fdf9bbda9d7edc7593a08e30ce6f30987de2a4..0000000000000000000000000000000000000000 --- a/mace/ops/arm/fp32/conv_2d_5x5.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_ARM_FP32_CONV_2D_5X5_H_ -#define MACE_OPS_ARM_FP32_CONV_2D_5X5_H_ - -#include -#include "mace/public/mace.h" -#include "mace/core/tensor.h" -#include "mace/core/op_context.h" -#include "mace/ops/arm/fp32/conv_2d.h" - -namespace mace { -namespace ops { -namespace arm { -namespace fp32 { - -class Conv2dK5x5S1 : public Conv2dBase { - public: - Conv2dK5x5S1(const std::vector &paddings, const Padding padding_type) - : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} - virtual ~Conv2dK5x5S1() {} - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; -}; - - -} // namespace fp32 -} // namespace arm -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_FP32_CONV_2D_5X5_H_ diff --git a/mace/ops/arm/fp32/conv_2d_7x7.cc b/mace/ops/arm/fp32/conv_2d_7x7.cc index 4ee8a045a8c61e72fb615816af0fc9c52b77f9b9..cc6963e7b1b8cd7eda4a09cb74a57d5f5ac3b6b2 100644 --- a/mace/ops/arm/fp32/conv_2d_7x7.cc +++ b/mace/ops/arm/fp32/conv_2d_7x7.cc @@ -17,6 +17,8 @@ #include #include +#include "mace/ops/delegator/conv_2d.h" + namespace mace { namespace ops { namespace arm { @@ -720,6 +722,13 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } +MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S1)) +MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S2, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S2)) +MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S3, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S3)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/conv_2d_7x7.h b/mace/ops/arm/fp32/conv_2d_7x7.h index 9324f4daac2392cb069935d3d46fc36274e8b8ea..0d0467fc5b38a354bab744503dafbe28b5f180f3 100644 --- a/mace/ops/arm/fp32/conv_2d_7x7.h +++ b/mace/ops/arm/fp32/conv_2d_7x7.h @@ -16,10 +16,11 @@ #define MACE_OPS_ARM_FP32_CONV_2D_7X7_H_ #include -#include "mace/public/mace.h" + +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/core/op_context.h" #include "mace/ops/arm/fp32/conv_2d.h" +#include "mace/public/mace.h" namespace mace { namespace ops { @@ -28,8 +29,8 @@ namespace fp32 { class Conv2dK7x7S1 : public Conv2dBase { public: - Conv2dK7x7S1(const std::vector &paddings, const Padding padding_type) - : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + explicit Conv2dK7x7S1(const delegator::Conv2dParam ¶m) + : Conv2dBase(param) {} virtual ~Conv2dK7x7S1() {} MaceStatus Compute( @@ -41,8 +42,8 @@ class Conv2dK7x7S1 : public Conv2dBase { class Conv2dK7x7S2 : public Conv2dBase { public: - Conv2dK7x7S2(const std::vector &paddings, const Padding padding_type) - : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {} + explicit Conv2dK7x7S2(const delegator::Conv2dParam ¶m) + : Conv2dBase(param) {} virtual ~Conv2dK7x7S2() {} MaceStatus Compute( @@ -54,8 +55,8 @@ class Conv2dK7x7S2 : public Conv2dBase { class Conv2dK7x7S3 : public Conv2dBase { public: - Conv2dK7x7S3(const std::vector &paddings, const Padding padding_type) - : Conv2dBase({3, 3}, {1, 1}, paddings, padding_type) {} + explicit Conv2dK7x7S3(const delegator::Conv2dParam ¶m) + : Conv2dBase(param) {} virtual ~Conv2dK7x7S3() {} MaceStatus Compute( diff --git a/mace/ops/arm/fp32/conv_general.cc b/mace/ops/arm/fp32/conv_general.cc index 25fb2441481cb5ac55da78e44327478b513de018..2fdc57e2ef7d9e0f029919249a0bb776d5183879 100644 
--- a/mace/ops/arm/fp32/conv_general.cc +++ b/mace/ops/arm/fp32/conv_general.cc @@ -12,15 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/ops/arm/fp32/conv_general.h" +#include "mace/ops/arm/fp32/conv_2d.h" #include +#include "mace/ops/delegator/conv_2d.h" + namespace mace { namespace ops { namespace arm { namespace fp32 { +class Conv2dGeneral : public Conv2dBase { + public: + explicit Conv2dGeneral(const delegator::Conv2dParam ¶m) + : Conv2dBase(param) {} + virtual ~Conv2dGeneral() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) override; +}; + MaceStatus Conv2dGeneral::Compute(const OpContext *context, const Tensor *input, const Tensor *filter, @@ -237,6 +252,10 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } +MACE_REGISTER_DELEGATOR( + registry, Conv2dGeneral, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, General)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/conv_general.h b/mace/ops/arm/fp32/conv_general.h deleted file mode 100644 index 115acdb3fe83cb80e1e20e7939c5fe03eed7c6da..0000000000000000000000000000000000000000 --- a/mace/ops/arm/fp32/conv_general.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_ARM_FP32_CONV_GENERAL_H_ -#define MACE_OPS_ARM_FP32_CONV_GENERAL_H_ - -#include -#include "mace/public/mace.h" -#include "mace/core/tensor.h" -#include "mace/core/op_context.h" -#include "mace/ops/arm/fp32/conv_2d.h" - -namespace mace { -namespace ops { -namespace arm { -namespace fp32 { - -class Conv2dGeneral : public Conv2dBase { - public: - Conv2dGeneral(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type) - : Conv2dBase(strides, dilations, paddings, padding_type) {} - virtual ~Conv2dGeneral() {} - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; -}; - -} // namespace fp32 -} // namespace arm -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_FP32_CONV_GENERAL_H_ diff --git a/mace/ops/arm/fp32/deconv_2d.h b/mace/ops/arm/fp32/deconv_2d.h index 554f2935992d0a6f901bbb7b40aab4b048d63616..128d5858beee4a8530ed3f775536fb3d1652c44b 100644 --- a/mace/ops/arm/fp32/deconv_2d.h +++ b/mace/ops/arm/fp32/deconv_2d.h @@ -18,54 +18,27 @@ #include #include -#include "mace/public/mace.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/core/op_context.h" #include "mace/ops/arm/fp32/gemm.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/delegator/deconv_2d.h" +#include "mace/public/mace.h" namespace mace { namespace ops { namespace arm { namespace fp32 { -class Deconv2dBase { +class Deconv2dBase : public delegator::Deconv2d { public: - Deconv2dBase(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type, - const index_t group, - const FrameworkType framework_type) - : strides_(strides), - dilations_(dilations), - paddings_(paddings), - padding_type_(padding_type), - group_(group), - framework_type_(framework_type) {} - - Deconv2dBase(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : Deconv2dBase(strides, - dilations, - paddings, - padding_type, - 1, - framework_type) {} + explicit Deconv2dBase(const delegator::Deconv2dParam ¶m) + : delegator::Deconv2d(param), + group_(param.group_) {} virtual ~Deconv2dBase() = default; - virtual MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) = 0; - protected: MaceStatus ResizeOutAndPadOut(const OpContext *context, const Tensor *input, @@ -78,13 +51,7 @@ class Deconv2dBase { void UnPadOutput(const Tensor &src, const std::vector &out_pad_size, Tensor *dst); - - const std::vector strides_; - const std::vector dilations_; - const std::vector paddings_; - const Padding padding_type_; index_t group_; - const FrameworkType framework_type_; }; } // namespace fp32 diff --git a/mace/ops/arm/fp32/deconv_2d_2x2.cc b/mace/ops/arm/fp32/deconv_2d_2x2.cc index c9d630bbb63c66d72684663659965e32b2be6b60..65cfc6e8d7020e1fd753cbed9a2e7416b1ff56b9 100644 --- a/mace/ops/arm/fp32/deconv_2d_2x2.cc +++ b/mace/ops/arm/fp32/deconv_2d_2x2.cc @@ -330,12 +330,18 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, } }, 0, batch, 1, 0, outch, 1); - UnPadOutput(*out_tensor, out_pad_size, output); return MaceStatus::MACE_SUCCESS; } +MACE_REGISTER_DELEGATOR(registry, Deconv2dK2x2S1, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, + NEON, 
K2x2S1)) +MACE_REGISTER_DELEGATOR(registry, Deconv2dK2x2S2, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, + NEON, K2x2S2)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/deconv_2d_2x2.h b/mace/ops/arm/fp32/deconv_2d_2x2.h index 05f80dece27fd6cf20d87861e04a512b94706939..6fd533444a2e1a1e910c2d527987112940ddb4cc 100644 --- a/mace/ops/arm/fp32/deconv_2d_2x2.h +++ b/mace/ops/arm/fp32/deconv_2d_2x2.h @@ -18,12 +18,12 @@ #include #include -#include "mace/public/mace.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/core/op_context.h" #include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/public/mace.h" namespace mace { namespace ops { @@ -32,10 +32,8 @@ namespace fp32 { class Deconv2dK2x2S1 : public Deconv2dBase { public: - Deconv2dK2x2S1(const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : Deconv2dBase({1, 1}, {1, 1}, paddings, padding_type, framework_type) {} + explicit Deconv2dK2x2S1(const delegator::Deconv2dParam ¶m) + : Deconv2dBase(param) {} virtual ~Deconv2dK2x2S1() {} MaceStatus Compute( @@ -48,10 +46,8 @@ class Deconv2dK2x2S1 : public Deconv2dBase { class Deconv2dK2x2S2 : public Deconv2dBase { public: - Deconv2dK2x2S2(const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : Deconv2dBase({2, 2}, {1, 1}, paddings, padding_type, framework_type) {} + explicit Deconv2dK2x2S2(const delegator::Deconv2dParam ¶m) + : Deconv2dBase(param) {} virtual ~Deconv2dK2x2S2() {} MaceStatus Compute( diff --git a/mace/ops/arm/fp32/deconv_2d_3x3.cc b/mace/ops/arm/fp32/deconv_2d_3x3.cc index b2ef6eae269316c9169e33bbb753606d8572c1ff..55911e25f432a21290295018eefacedb00cfd25d 100644 --- a/mace/ops/arm/fp32/deconv_2d_3x3.cc +++ b/mace/ops/arm/fp32/deconv_2d_3x3.cc @@ -464,6 +464,13 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } +MACE_REGISTER_DELEGATOR(registry, Deconv2dK3x3S1, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, + NEON, K3x3S1)) +MACE_REGISTER_DELEGATOR(registry, Deconv2dK3x3S2, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, + NEON, K3x3S2)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/deconv_2d_3x3.h b/mace/ops/arm/fp32/deconv_2d_3x3.h index 4495cbe8e4ef5fa3b05c72e9970fa05fb67a7fbb..65cc23e6f365d9809d983c94bc12855760046a17 100644 --- a/mace/ops/arm/fp32/deconv_2d_3x3.h +++ b/mace/ops/arm/fp32/deconv_2d_3x3.h @@ -18,12 +18,12 @@ #include #include -#include "mace/public/mace.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/core/op_context.h" #include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/public/mace.h" namespace mace { namespace ops { @@ -32,10 +32,8 @@ namespace fp32 { class Deconv2dK3x3S1 : public Deconv2dBase { public: - Deconv2dK3x3S1(const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : Deconv2dBase({1, 1}, {1, 1}, paddings, padding_type, framework_type) {} + explicit Deconv2dK3x3S1(const delegator::Deconv2dParam ¶m) + : Deconv2dBase(param) {} virtual ~Deconv2dK3x3S1() {} MaceStatus Compute( @@ -48,10 +46,8 @@ class Deconv2dK3x3S1 : public Deconv2dBase { class Deconv2dK3x3S2 : public Deconv2dBase { public: - 
Deconv2dK3x3S2(const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : Deconv2dBase({2, 2}, {1, 1}, paddings, padding_type, framework_type) {} + explicit Deconv2dK3x3S2(const delegator::Deconv2dParam ¶m) + : Deconv2dBase(param) {} virtual ~Deconv2dK3x3S2() {} MaceStatus Compute( diff --git a/mace/ops/arm/fp32/deconv_2d_4x4.cc b/mace/ops/arm/fp32/deconv_2d_4x4.cc index 3c47ecff71bc46ea02aa73cb49d511a22c61ba27..b2e17afa75f2545d820722ad90b3297397941a56 100644 --- a/mace/ops/arm/fp32/deconv_2d_4x4.cc +++ b/mace/ops/arm/fp32/deconv_2d_4x4.cc @@ -449,7 +449,6 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, const index_t outw = out_shape[3]; const index_t out_img_size = outh * outw; - utils::ThreadPool &thread_pool = context->device()->cpu_runtime()->thread_pool(); @@ -575,6 +574,13 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } +MACE_REGISTER_DELEGATOR(registry, Deconv2dK4x4S1, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, + NEON, K4x4S1)) +MACE_REGISTER_DELEGATOR(registry, Deconv2dK4x4S2, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, + NEON, K4x4S2)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/deconv_2d_4x4.h b/mace/ops/arm/fp32/deconv_2d_4x4.h index 9f09056af0224331fca8815cca18a1f7eecdd1cc..bf86a62ab4575ef20072dc6f1fd648f2bd65da14 100644 --- a/mace/ops/arm/fp32/deconv_2d_4x4.h +++ b/mace/ops/arm/fp32/deconv_2d_4x4.h @@ -18,12 +18,12 @@ #include #include -#include "mace/public/mace.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/core/op_context.h" #include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/public/mace.h" namespace mace { namespace ops { @@ -32,10 +32,8 @@ namespace fp32 { class Deconv2dK4x4S1 : public Deconv2dBase { public: - Deconv2dK4x4S1(const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : Deconv2dBase({1, 1}, {1, 1}, paddings, padding_type, framework_type) {} + explicit Deconv2dK4x4S1(const delegator::Deconv2dParam ¶m) + : Deconv2dBase(param) {} virtual ~Deconv2dK4x4S1() {} MaceStatus Compute( @@ -48,10 +46,8 @@ class Deconv2dK4x4S1 : public Deconv2dBase { class Deconv2dK4x4S2 : public Deconv2dBase { public: - Deconv2dK4x4S2(const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : Deconv2dBase({2, 2}, {1, 1}, paddings, padding_type, framework_type) {} + explicit Deconv2dK4x4S2(const delegator::Deconv2dParam ¶m) + : Deconv2dBase(param) {} virtual ~Deconv2dK4x4S2() {} MaceStatus Compute( diff --git a/mace/ops/arm/fp32/deconv_2d_general.cc b/mace/ops/arm/fp32/deconv_2d_general.cc index 47bfe39cf27adac58b1240afa66390fc23dc8866..5ffe7b0d7a25bf92824ee1120e65ede9b50fcc08 100644 --- a/mace/ops/arm/fp32/deconv_2d_general.cc +++ b/mace/ops/arm/fp32/deconv_2d_general.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
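Editor's sketch: the deconv hunks above all follow one registration pattern, where a kernel-specific class takes a single `delegator::Deconv2dParam` and binds itself to a string key with `MACE_REGISTER_DELEGATOR`. The sketch below illustrates that pattern using the hypothetical tile size `K5x5S1`, which is not part of this patch; the declaration and registration are shown together for brevity, and the Compute body is omitted.

```c++
// Illustrative only: Deconv2dK5x5S1 does not exist in this patch; it shows the
// shape of a kernel-specific NEON deconv delegator after the refactor.
// In the patch itself, MACE_REGISTER_DELEGATOR sits in the .cc next to the
// Compute() definition rather than in the header.
#include "mace/ops/arm/fp32/deconv_2d.h"

namespace mace {
namespace ops {
namespace arm {
namespace fp32 {

class Deconv2dK5x5S1 : public Deconv2dBase {
 public:
  // Strides, dilations, paddings, group and framework type all arrive through
  // one param object instead of a long constructor argument list.
  explicit Deconv2dK5x5S1(const delegator::Deconv2dParam &param)
      : Deconv2dBase(param) {}
  virtual ~Deconv2dK5x5S1() {}

  MaceStatus Compute(const OpContext *context,
                     const Tensor *input,
                     const Tensor *filter,
                     const Tensor *output_shape,
                     Tensor *output) override;  // definition omitted here
};

// Binds the class to the key that Deconv2dOp looks up at runtime.
MACE_REGISTER_DELEGATOR(registry, Deconv2dK5x5S1, delegator::Deconv2dParam,
                        MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
                                              NEON, K5x5S1))

}  // namespace fp32
}  // namespace arm
}  // namespace ops
}  // namespace mace
```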
-#include "mace/ops/arm/fp32/deconv_2d_general.h" +#include "mace/ops/arm/fp32/deconv_2d.h" // TODO(liutuo): optimize it @@ -21,6 +21,20 @@ namespace ops { namespace arm { namespace fp32 { +class Deconv2dGeneral : public Deconv2dBase { + public: + explicit Deconv2dGeneral(const delegator::Deconv2dParam ¶m) + : Deconv2dBase(param) {} + virtual ~Deconv2dGeneral() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) override; +}; + MaceStatus Deconv2dGeneral::Compute(const OpContext *context, const Tensor *input, const Tensor *filter, @@ -110,6 +124,10 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } +MACE_REGISTER_DELEGATOR(registry, Deconv2dGeneral, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, + NEON, General)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/deconv_2d_general.h b/mace/ops/arm/fp32/deconv_2d_general.h deleted file mode 100644 index d11ada030c02c4f155aec12e0a162513cdae0c25..0000000000000000000000000000000000000000 --- a/mace/ops/arm/fp32/deconv_2d_general.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_ARM_FP32_DECONV_2D_GENERAL_H_ -#define MACE_OPS_ARM_FP32_DECONV_2D_GENERAL_H_ - -#include -#include - -#include "mace/public/mace.h" -#include "mace/core/tensor.h" -#include "mace/core/types.h" -#include "mace/core/op_context.h" -#include "mace/ops/arm/fp32/deconv_2d.h" -#include "mace/ops/common/conv_pool_2d_util.h" - -namespace mace { -namespace ops { -namespace arm { -namespace fp32 { - -class Deconv2dGeneral : public Deconv2dBase { - public: - Deconv2dGeneral(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : Deconv2dBase(strides, - dilations, - paddings, - padding_type, - framework_type) {} - virtual ~Deconv2dGeneral() {} - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; -}; - -} // namespace fp32 -} // namespace arm -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_FP32_DECONV_2D_GENERAL_H_ diff --git a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc index a27827b471818c049a09e532c059b56396e8f452..8d77672b7ab094771e067722f703e8bc0e27a6d1 100644 --- a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc +++ b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc @@ -512,6 +512,13 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, return MaceStatus::MACE_SUCCESS; } +MACE_REGISTER_DELEGATOR( + registry, DepthwiseConv2dK3x3S1, delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, NEON, K3x3S1)) +MACE_REGISTER_DELEGATOR( + registry, DepthwiseConv2dK3x3S2, delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, NEON, K3x3S2)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h index c130fbffd361dfb33be9974b3d603e630cb80979..49412b808dde686c26fff1b80137ab86c78d65f9 100644 --- a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h +++ b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h @@ -16,10 +16,12 @@ #define MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ #include -#include "mace/public/mace.h" + +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/core/op_context.h" #include "mace/ops/arm/fp32/conv_2d.h" +#include "mace/ops/delegator/depthwise_conv_2d.h" +#include "mace/public/mace.h" namespace mace { namespace ops { @@ -28,9 +30,8 @@ namespace fp32 { class DepthwiseConv2dK3x3S1 : public Conv2dBase { public: - DepthwiseConv2dK3x3S1(const std::vector &paddings, - const Padding padding_type) - : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + explicit DepthwiseConv2dK3x3S1(const delegator::DepthwiseConv2dParam ¶m) + : Conv2dBase(param) {} virtual ~DepthwiseConv2dK3x3S1() {} MaceStatus Compute( @@ -42,9 +43,8 @@ class DepthwiseConv2dK3x3S1 : public Conv2dBase { class DepthwiseConv2dK3x3S2 : public Conv2dBase { public: - DepthwiseConv2dK3x3S2(const std::vector &paddings, - const Padding padding_type) - : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {} + explicit DepthwiseConv2dK3x3S2(const delegator::DepthwiseConv2dParam ¶m) + : Conv2dBase(param) {} virtual ~DepthwiseConv2dK3x3S2() {} MaceStatus Compute( diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc index 3cd6d527b7f1fa67d053cc96dea8ae6505e32352..291075ae2205d61035e211fd1c8daa04bec8c9d5 100644 --- 
a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc @@ -776,6 +776,20 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } +MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dK3x3S1, delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K3x3S1)) +MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dK3x3S2, delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K3x3S2)) + +MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dK3x3S1, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K3x3S1)) +MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dK3x3S2, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K3x3S2)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h index 5dd315a47ad5e0c9a815b64ca3c5c0de63faf25e..eeb21d6c3c5d50502b268e61f3b0726066a963cb 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h @@ -18,12 +18,13 @@ #include #include -#include "mace/public/mace.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/core/op_context.h" #include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/delegator/depthwise_deconv_2d.h" +#include "mace/public/mace.h" namespace mace { namespace ops { @@ -32,14 +33,9 @@ namespace fp32 { class DepthwiseDeconv2dK3x3S1 : public Deconv2dBase { public: - DepthwiseDeconv2dK3x3S1(const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : Deconv2dBase({1, 1}, - {1, 1}, - paddings, - padding_type, - framework_type) {} + explicit DepthwiseDeconv2dK3x3S1( + const delegator::DepthwiseDeconv2dParam ¶m) + : Deconv2dBase(param) {} virtual ~DepthwiseDeconv2dK3x3S1() {} MaceStatus Compute( @@ -52,14 +48,9 @@ class DepthwiseDeconv2dK3x3S1 : public Deconv2dBase { class DepthwiseDeconv2dK3x3S2 : public Deconv2dBase { public: - DepthwiseDeconv2dK3x3S2(const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : Deconv2dBase({2, 2}, - {1, 1}, - paddings, - padding_type, - framework_type) {} + explicit DepthwiseDeconv2dK3x3S2( + const delegator::DepthwiseDeconv2dParam ¶m) + : Deconv2dBase(param) {} virtual ~DepthwiseDeconv2dK3x3S2() {} MaceStatus Compute( @@ -72,16 +63,9 @@ class DepthwiseDeconv2dK3x3S2 : public Deconv2dBase { class GroupDeconv2dK3x3S1 : public Deconv2dBase { public: - GroupDeconv2dK3x3S1(const std::vector &paddings, - const Padding padding_type, - const int group, - const FrameworkType framework_type) - : Deconv2dBase({1, 1}, - {1, 1}, - paddings, - padding_type, - group, - framework_type) {} + explicit GroupDeconv2dK3x3S1( + const delegator::GroupDeconv2dParam ¶m) + : Deconv2dBase(param) {} virtual ~GroupDeconv2dK3x3S1() {} MaceStatus Compute( @@ -94,16 +78,8 @@ class GroupDeconv2dK3x3S1 : public Deconv2dBase { class GroupDeconv2dK3x3S2 : public Deconv2dBase { public: - GroupDeconv2dK3x3S2(const std::vector &paddings, - const Padding padding_type, - const int group, - const FrameworkType framework_type) - : Deconv2dBase({2, 2}, - {1, 1}, - paddings, - padding_type, - group, - framework_type) {} + explicit GroupDeconv2dK3x3S2(const 
delegator::GroupDeconv2dParam ¶m) + : Deconv2dBase(param) {} virtual ~GroupDeconv2dK3x3S2() {} MaceStatus Compute( diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc index 85c93b0cef7b53dc170d48eeaa6c65154f85c8e8..f9de2de3df27aeabb4eb9199140993fbd5abb31e 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc @@ -959,6 +959,20 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } +MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dK4x4S1, delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K4x4S1)) +MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dK4x4S2, delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K4x4S2)) + +MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dK4x4S1, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K4x4S1)) +MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dK4x4S2, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K4x4S2)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h index 4b73ed010afdd783f45e39d638db01427070e717..31d5bd99ed5cfe287026f99ac89d3721c7fed8bb 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h @@ -18,12 +18,13 @@ #include #include -#include "mace/public/mace.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/core/op_context.h" #include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/delegator/depthwise_deconv_2d.h" +#include "mace/public/mace.h" namespace mace { namespace ops { @@ -32,14 +33,9 @@ namespace fp32 { class DepthwiseDeconv2dK4x4S1 : public Deconv2dBase { public: - DepthwiseDeconv2dK4x4S1(const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : Deconv2dBase({1, 1}, - {1, 1}, - paddings, - padding_type, - framework_type) {} + explicit DepthwiseDeconv2dK4x4S1( + const delegator::DepthwiseDeconv2dParam ¶m) + : Deconv2dBase(param) {} virtual ~DepthwiseDeconv2dK4x4S1() {} MaceStatus Compute( @@ -52,14 +48,9 @@ class DepthwiseDeconv2dK4x4S1 : public Deconv2dBase { class DepthwiseDeconv2dK4x4S2 : public Deconv2dBase { public: - DepthwiseDeconv2dK4x4S2(const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : Deconv2dBase({2, 2}, - {1, 1}, - paddings, - padding_type, - framework_type) {} + explicit DepthwiseDeconv2dK4x4S2( + const delegator::DepthwiseDeconv2dParam ¶m) + : Deconv2dBase(param) {} virtual ~DepthwiseDeconv2dK4x4S2() {} MaceStatus Compute( @@ -72,16 +63,8 @@ class DepthwiseDeconv2dK4x4S2 : public Deconv2dBase { class GroupDeconv2dK4x4S1 : public Deconv2dBase { public: - GroupDeconv2dK4x4S1(const std::vector &paddings, - const Padding padding_type, - const int group, - const FrameworkType framework_type) - : Deconv2dBase({1, 1}, - {1, 1}, - paddings, - padding_type, - group, - framework_type) {} + explicit GroupDeconv2dK4x4S1(const delegator::GroupDeconv2dParam ¶m) + : Deconv2dBase(param) {} virtual ~GroupDeconv2dK4x4S1() {} MaceStatus Compute( @@ -94,16 +77,8 @@ class GroupDeconv2dK4x4S1 : public Deconv2dBase { class GroupDeconv2dK4x4S2 : public 
Deconv2dBase { public: - GroupDeconv2dK4x4S2(const std::vector &paddings, - const Padding padding_type, - const int group, - const FrameworkType framework_type) - : Deconv2dBase({2, 2}, - {1, 1}, - paddings, - padding_type, - group, - framework_type) {} + explicit GroupDeconv2dK4x4S2(const delegator::GroupDeconv2dParam ¶m) + : Deconv2dBase(param) {} virtual ~GroupDeconv2dK4x4S2() {} MaceStatus Compute( diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc b/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc index a45d5acc6a663d370f1b741b5b15598c9fd40e22..81d715e26dbb34186bcd873b9dc083b27cd1a352 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc @@ -207,6 +207,14 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } +MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dGeneral, delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, General)) + +MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dGeneral, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, General)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_general.h b/mace/ops/arm/fp32/depthwise_deconv_2d_general.h index d73480c5ea1a4fff7aa06656efb9a964acc1b01d..924924498301592de6dd1c9af6473eb61d289407 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_general.h +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_general.h @@ -18,12 +18,13 @@ #include #include -#include "mace/public/mace.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/core/op_context.h" #include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/delegator/depthwise_deconv_2d.h" +#include "mace/public/mace.h" namespace mace { namespace ops { @@ -32,16 +33,9 @@ namespace fp32 { class DepthwiseDeconv2dGeneral : public Deconv2dBase { public: - DepthwiseDeconv2dGeneral(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : Deconv2dBase(strides, - dilations, - paddings, - padding_type, - framework_type) {} + explicit DepthwiseDeconv2dGeneral( + const delegator::DepthwiseDeconv2dParam ¶m) + : Deconv2dBase(param) {} virtual ~DepthwiseDeconv2dGeneral() {} MaceStatus Compute( @@ -54,18 +48,8 @@ class DepthwiseDeconv2dGeneral : public Deconv2dBase { class GroupDeconv2dGeneral : public Deconv2dBase { public: - GroupDeconv2dGeneral(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type, - const int group, - const FrameworkType framework_type) - : Deconv2dBase(strides, - dilations, - paddings, - padding_type, - group, - framework_type) {} + explicit GroupDeconv2dGeneral(const delegator::GroupDeconv2dParam ¶m) + : Deconv2dBase(param) {} virtual ~GroupDeconv2dGeneral() {} MaceStatus Compute( diff --git a/mace/ops/arm/fp32/gemm.cc b/mace/ops/arm/fp32/gemm.cc index ff26052ffae16a064f4873151ef675c83d1ecbb3..ca429e63d544e13774eb4073c02e9fd6122ad499 100644 --- a/mace/ops/arm/fp32/gemm.cc +++ b/mace/ops/arm/fp32/gemm.cc @@ -1224,6 +1224,9 @@ MaceStatus Gemm::Compute(const OpContext *context, output); } +MACE_REGISTER_DELEGATOR(registry, Gemm, delegator::GemmParam, + MACE_DELEGATOR_KEY(Gemm, CPU, float, NEON)) + } // namespace fp32 } // namespace arm } // 
namespace ops diff --git a/mace/ops/arm/fp32/gemm.h b/mace/ops/arm/fp32/gemm.h index 00b4d80eef4bf27f98c54f1c77a51765cc7f530d..4910ae358347bf94eef076e63934f9365aa1ef79 100644 --- a/mace/ops/arm/fp32/gemm.h +++ b/mace/ops/arm/fp32/gemm.h @@ -15,10 +15,11 @@ #ifndef MACE_OPS_ARM_FP32_GEMM_H_ #define MACE_OPS_ARM_FP32_GEMM_H_ -#include "mace/public/mace.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/core/op_context.h" #include "mace/ops/common/matrix.h" +#include "mace/ops/delegator/gemm.h" +#include "mace/public/mace.h" #include "mace/utils/math.h" // This implements matrix-matrix multiplication. @@ -29,13 +30,12 @@ namespace ops { namespace arm { namespace fp32 { -class Gemm { +class Gemm : public delegator::Gemm { public: - explicit Gemm(const bool should_cache_pack) - : pack_cache_(GetCPUAllocator()), - should_cache_pack_(should_cache_pack), + explicit Gemm(const delegator::GemmParam ¶m) + : delegator::Gemm(param), pack_cache_(GetCPUAllocator()), + should_cache_pack_(param.should_cache_pack_), cached_(0) {} - Gemm() : Gemm(false) {} ~Gemm() {} MaceStatus Compute( @@ -51,7 +51,7 @@ class Gemm { const MatrixMajor output_major, const bool lhs_batched, const bool rhs_batched, - Tensor *output); + Tensor *output) override; // Original matrix before transpose has row-major MaceStatus Compute( @@ -68,7 +68,7 @@ class Gemm { const bool transpose_out, const bool lhs_batched, const bool rhs_batched, - Tensor *output); + Tensor *output) override; private: void ComputeBlock(const float *packed_lhs_data, diff --git a/mace/ops/arm/fp32/gemv.cc b/mace/ops/arm/fp32/gemv.cc index 2f2866cf0da86dd70402d28810247821f229d85b..317e422404327f50b6874993a2ed10f76a000e87 100644 --- a/mace/ops/arm/fp32/gemv.cc +++ b/mace/ops/arm/fp32/gemv.cc @@ -378,6 +378,10 @@ MaceStatus Gemv::Compute(const OpContext *context, #undef vaddvq_f32 #endif + +MACE_REGISTER_DELEGATOR(registry, Gemv, DelegatorParam, + MACE_DELEGATOR_KEY(Gemv, CPU, float, NEON)) + } // namespace fp32 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/fp32/gemv.h b/mace/ops/arm/fp32/gemv.h index 1f406426fbe93ae965f23450eca2a5ba1c517db1..9933cf42b817e20945517588a87dfca2232e7411 100644 --- a/mace/ops/arm/fp32/gemv.h +++ b/mace/ops/arm/fp32/gemv.h @@ -15,18 +15,19 @@ #ifndef MACE_OPS_ARM_FP32_GEMV_H_ #define MACE_OPS_ARM_FP32_GEMV_H_ -#include "mace/public/mace.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/core/op_context.h" +#include "mace/ops/delegator/gemv.h" +#include "mace/public/mace.h" namespace mace { namespace ops { namespace arm { namespace fp32 { -class Gemv { +class Gemv : public delegator::Gemv { public: - Gemv() {} + explicit Gemv(const DelegatorParam ¶m) : delegator::Gemv(param) {} ~Gemv() {} // Always row-major after transpose MaceStatus Compute( @@ -39,7 +40,7 @@ class Gemv { const index_t lhs_width, const bool lhs_batched, const bool rhs_batched, - Tensor *output); + Tensor *output) override; }; } // namespace fp32 diff --git a/mace/ops/arm/q8/eltwise.cc b/mace/ops/arm/q8/eltwise.cc index bdaa57a640ec6e6d66cd080830211b95c4ceb5b5..74d44104c422f555ee9e5b18ab5647aba9c7f2bd 100644 --- a/mace/ops/arm/q8/eltwise.cc +++ b/mace/ops/arm/q8/eltwise.cc @@ -12,12 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
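Editor's sketch: on the op side, these Gemm/Gemv kernels are no longer instantiated directly; an op asks the delegator registry for them by key, mirroring the `Activation`/`BiasAdd` usage elsewhere in this patch. A minimal sketch follows, assuming the `Create()` signature shown in `batch_norm.cc` and `bias_add.cc` below; `MyGemvUserOp` is a hypothetical op used only for illustration.

```c++
// Sketch only: how an op obtains the registered Gemv kernel instead of
// holding an arm::fp32::Gemv member directly.
#include <memory>

#include "mace/core/ops/operator.h"
#include "mace/ops/delegator/gemv.h"

namespace mace {
namespace ops {

class MyGemvUserOp : public Operation {  // hypothetical op
 public:
  explicit MyGemvUserOp(OpConstructContext *context)
      : Operation(context),
        gemv_(delegator::Gemv::Create(
            context->workspace(),
            MACE_DELEGATOR_KEY(Gemv, CPU, float, MACE_CPU_IMPL_TYPE),
            DelegatorParam())) {}

  MaceStatus Run(OpContext *context) override {
    // In a real op, gemv_->Compute(context, lhs, rhs, bias, ...) is called
    // here with the same argument list as before the refactor.
    MACE_UNUSED(context);
    return MaceStatus::MACE_SUCCESS;
  }

 private:
  std::unique_ptr<delegator::Gemv> gemv_;
};

}  // namespace ops
}  // namespace mace
```

Because the op only holds a `std::unique_ptr<delegator::Gemv>`, choosing between the NEON and reference implementations becomes a registration decision rather than a compile-time `#ifdef` in every op.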
-#include "mace/ops/arm/q8/eltwise.h" - #include #include #include "mace/ops/common/gemmlowp_util.h" +#include "mace/ops/delegator/eltwise.h" #include "mace/utils/logging.h" namespace mace { @@ -25,6 +24,16 @@ namespace ops { namespace arm { namespace q8 { +class Eltwise : public delegator::Eltwise { + public: + explicit Eltwise(const delegator::EltwiseParam ¶m) + : delegator::Eltwise(param) {} + ~Eltwise() = default; + + MaceStatus Compute(const OpContext *context, const Tensor *input0, + const Tensor *input1, Tensor *output) override; +}; + MaceStatus Eltwise::Compute(const OpContext *context, const Tensor *input0, const Tensor *input1, @@ -144,7 +153,7 @@ MaceStatus Eltwise::Compute(const OpContext *context, gemmlowp::SaturatingRoundingDoublingHighMul( res, output_multiplier), -output_shift) + - output->zero_point(); + output->zero_point(); output_ptr[i] = Saturate(output_val); } }, @@ -153,6 +162,9 @@ MaceStatus Eltwise::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } +MACE_REGISTER_DELEGATOR(registry, Eltwise, delegator::EltwiseParam, + MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, NEON)) + } // namespace q8 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/q8/eltwise.h b/mace/ops/arm/q8/eltwise.h deleted file mode 100644 index 200b13cb2769787a92c2d03da40f1b2e10d65900..0000000000000000000000000000000000000000 --- a/mace/ops/arm/q8/eltwise.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// This implements matrix-vector multiplication described as -// https://github.com/google/gemmlowp/blob/master/todo/fast-gemv.txt - -#ifndef MACE_OPS_ARM_Q8_ELTWISE_H_ -#define MACE_OPS_ARM_Q8_ELTWISE_H_ - -#include "mace/core/op_context.h" -#include "mace/core/types.h" -#include "mace/ops/common/eltwise_type.h" - -namespace mace { -namespace ops { -namespace arm { -namespace q8 { - -class Eltwise { - public: - explicit Eltwise(const EltwiseType type) : type_(type) {} - - MaceStatus Compute(const OpContext *context, - const Tensor *input0, - const Tensor *input1, - Tensor *output); - - private: - EltwiseType type_; -}; - -} // namespace q8 -} // namespace arm -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_Q8_ELTWISE_H_ diff --git a/mace/ops/arm/q8/gemv.cc b/mace/ops/arm/q8/gemv.cc index 388c68147ff305cf603c95a62293024b7b1db03d..11290d357d0a33992ba52d3a5b8de31040a66738 100644 --- a/mace/ops/arm/q8/gemv.cc +++ b/mace/ops/arm/q8/gemv.cc @@ -181,6 +181,14 @@ class Gemv; template class Gemv; +typedef Gemv GemvUint8; +MACE_REGISTER_DELEGATOR(registry, GemvUint8, DelegatorParam, + MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, NEON)) + +typedef Gemv GemvInt32; +MACE_REGISTER_DELEGATOR(registry, GemvInt32, DelegatorParam, + MACE_DELEGATOR_KEY(Gemv, CPU, int32_t, NEON)) + } // namespace q8 } // namespace arm } // namespace ops diff --git a/mace/ops/arm/q8/gemv.h b/mace/ops/arm/q8/gemv.h index 21a275798a7dd9533c1645d606386aa89cf91a92..c9b98a07d3f50e5f5c26ff42caf791e9b6d38b67 100644 --- a/mace/ops/arm/q8/gemv.h +++ b/mace/ops/arm/q8/gemv.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,15 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
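Editor's note: the quantized Gemv above is registered once per output type. Spelled out with explicit template arguments (a reconstruction, assuming the q8 `Gemv` is a class template over its output type, as the `uint8_t`/`int32_t` delegator keys indicate), the registration block reads:

```c++
// Reconstruction for clarity; the typedefs give each instantiation a plain
// name that MACE_REGISTER_DELEGATOR can accept.
typedef Gemv<uint8_t> GemvUint8;
MACE_REGISTER_DELEGATOR(registry, GemvUint8, DelegatorParam,
                        MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, NEON))

typedef Gemv<int32_t> GemvInt32;
MACE_REGISTER_DELEGATOR(registry, GemvInt32, DelegatorParam,
                        MACE_DELEGATOR_KEY(Gemv, CPU, int32_t, NEON))
```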
-// This implements matrix-vector multiplication described as -// https://github.com/google/gemmlowp/blob/master/todo/fast-gemv.txt - #ifndef MACE_OPS_ARM_Q8_GEMV_H_ #define MACE_OPS_ARM_Q8_GEMV_H_ -#include "mace/public/mace.h" -#include "mace/core/tensor.h" -#include "mace/core/op_context.h" +#include "mace/ops/delegator/gemv.h" namespace mace { namespace ops { @@ -28,11 +23,11 @@ namespace arm { namespace q8 { template -class Gemv { +class Gemv : public delegator::Gemv { public: - Gemv() : is_output_type_uint8_( - DataTypeToEnum::value == DataType::DT_UINT8) { - } + explicit Gemv(const DelegatorParam ¶m) + : delegator::Gemv(param), is_output_type_uint8_( + DataTypeToEnum::value == DataType::DT_UINT8) {} ~Gemv() {} // Always row-major after transpose MaceStatus Compute( @@ -45,7 +40,7 @@ class Gemv { const index_t lhs_width, const bool lhs_batched, const bool rhs_batched, - Tensor *output); + Tensor *output) override; private: bool is_output_type_uint8_; diff --git a/mace/ops/arm/q8/quantize.cc b/mace/ops/arm/q8/quantize.cc index 9c80dcbc5ba3ac0f6c2770c9c5249ff8c70e73c8..4a8d402b2d859fbfb486eb7860d675a2320815ce 100644 --- a/mace/ops/arm/q8/quantize.cc +++ b/mace/ops/arm/q8/quantize.cc @@ -17,7 +17,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/core/tensor.h" #include "mace/core/quantize.h" @@ -106,12 +107,12 @@ class DequantizeOp : public Operation { QuantizeUtil quantize_util_; }; -void RegisterQuantize(OpRegistryBase *op_registry) { +void RegisterQuantize(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Quantize", QuantizeOp, DeviceType::CPU, uint8_t); } -void RegisterDequantize(OpRegistryBase *op_registry) { +void RegisterDequantize(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Dequantize", DequantizeOp, DeviceType::CPU, uint8_t); MACE_REGISTER_OP(op_registry, "Dequantize", DequantizeOp, diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index a27e46c5739428e6b08952db83f0dfce5b60e798..88c9a179fe2982b1ec38821dd850784d97953608 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -16,14 +16,10 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/ops/activation.h" - -#if defined(MACE_ENABLE_NEON) -#include "mace/ops/arm/fp32/activation.h" -#else -#include "mace/ops/ref/activation.h" -#endif +#include "mace/ops/delegator/activation.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" @@ -45,11 +41,16 @@ class BatchNormOp : public Operation { epsilon_(Operation::GetOptionalArg("epsilon", static_cast(1e-4))), activation_delegator_( - ops::StringToActivationType( - Operation::GetOptionalArg("activation", "NOOP")), - Operation::GetOptionalArg("max_limit", 0.0f), - Operation::GetOptionalArg( - "leakyrelu_coefficient", 0.0f)) {} + delegator::Activation::Create( + context->workspace(), + MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), + delegator::ActivationParam( + ops::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP")), + Operation::GetOptionalArg("max_limit", 0.0f), + Operation::GetOptionalArg("leakyrelu_coefficient", + 0.0f)))) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -142,18 +143,14 @@ class BatchNormOp : public Operation { }, 0, batch, 1, 0, channels, 1); } - activation_delegator_.Compute(context, output, output); + 
activation_delegator_->Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } private: float epsilon_; -#ifdef MACE_ENABLE_NEON - arm::fp32::Activation activation_delegator_; -#else - ref::Activation activation_delegator_; -#endif // MACE_ENABLE_NEON + std::unique_ptr activation_delegator_; protected: MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); @@ -232,7 +229,7 @@ class BatchNormOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterBatchNorm(OpRegistryBase *op_registry) { +void RegisterBatchNorm(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, DeviceType::CPU, float); MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp); diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc index 937387fc6be78587c0898a5ab5d00a3640b87d3b..90324cd76f0797ae0535b99c139f48ee58077a35 100644 --- a/mace/ops/batch_to_space.cc +++ b/mace/ops/batch_to_space.cc @@ -15,7 +15,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/batch_to_space.h" #endif // MACE_ENABLE_OPENCL @@ -285,7 +286,7 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { }; #endif // MACE_ENABLE_OPENCL -void RegisterBatchToSpaceND(OpRegistryBase *op_registry) { +void RegisterBatchToSpaceND(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "BatchToSpaceND", BatchToSpaceNDOp, DeviceType::CPU, float); diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index c17a6e49dd5ab74302933d31fea4d07a197c4a8f..54a0f2710ad7ca8430e26d9661baf6a86b58c315 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -16,14 +16,10 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/ops/activation.h" - -#ifdef MACE_ENABLE_NEON -#include "mace/ops/arm/fp32/bias_add.h" -#else -#include "mace/ops/ref/bias_add.h" -#endif // MACE_ENABLE_NEON +#include "mace/ops/delegator/bias_add.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" @@ -42,8 +38,11 @@ class BiasAddOp : public Operation { public: explicit BiasAddOp(OpConstructContext *context) : Operation(context), - has_data_format_(Operation::GetOptionalArg("has_data_format", - 0)) {} + has_data_format_(Operation::GetOptionalArg("has_data_format", 0)), + bias_add_delegator_(delegator::BiasAdd::Create( + context->workspace(), + MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE), + DelegatorParam())) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -56,7 +55,7 @@ class BiasAddOp : public Operation { MACE_CHECK(bias->dim_size() == 1 || bias->dim_size() == 2, "bias must be 1-dimensional or n*c for caffee.", MakeString(bias->shape())); - bias_add_delegator_.Compute(context, input, bias, output); + bias_add_delegator_->Compute(context, input, bias, output); } else { // NHWC MACE_CHECK(bias->dim_size() == 1 || bias->dim_size() == 2, "bias must be 1 or 2 dimensionals for caffee.", @@ -115,11 +114,7 @@ class BiasAddOp : public Operation { private: int has_data_format_; -#ifdef MACE_ENABLE_NEON - arm::fp32::BiasAdd bias_add_delegator_; -#else - ref::BiasAdd bias_add_delegator_; -#endif // MACE_ENABLE_NEON + std::unique_ptr bias_add_delegator_; }; #ifdef MACE_ENABLE_OPENCL @@ -164,7 +159,7 @@ class BiasAddOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterBiasAdd(OpRegistryBase *op_registry) { +void 
RegisterBiasAdd(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, DeviceType::CPU, float); MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp); diff --git a/mace/ops/cast.cc b/mace/ops/cast.cc index 940959a93f0333033e26a0825f28cf0f735f1bb3..dfa42a7600de0f7ebc0a4e6cc8dac7c12c783db8 100644 --- a/mace/ops/cast.cc +++ b/mace/ops/cast.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) #include @@ -54,7 +55,7 @@ class CastOp : public Operation { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -void RegisterCast(OpRegistryBase *op_registry) { +void RegisterCast(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Cast", CastOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "Cast", CastOp, diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc index a7fababb3e9a2806d4de0eb4b9d91600c4180a30..cddda38db323d70151093bcf9a84446f6f3cc5e4 100644 --- a/mace/ops/channel_shuffle.cc +++ b/mace/ops/channel_shuffle.cc @@ -14,7 +14,8 @@ #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/channel_shuffle.h" #endif // MACE_ENABLE_OPENCL @@ -98,7 +99,7 @@ class ChannelShuffleOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterChannelShuffle(OpRegistryBase *op_registry) { +void RegisterChannelShuffle(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "ChannelShuffle", ChannelShuffleOp, DeviceType::CPU, float); diff --git a/mace/ops/common/lstm.h b/mace/ops/common/lstm.h index d9e4024894dba1a7c3995e8239ef0a9e814a50e9..a22094e59abcc3b4e7331e7103ad12a49229786d 100644 --- a/mace/ops/common/lstm.h +++ b/mace/ops/common/lstm.h @@ -15,8 +15,8 @@ #ifndef MACE_OPS_COMMON_LSTM_H_ #define MACE_OPS_COMMON_LSTM_H_ +#include "mace/core/ops/op_context.h" #include "mace/core/types.h" -#include "mace/core/op_context.h" namespace mace { namespace ops { diff --git a/mace/ops/common/transpose.h b/mace/ops/common/transpose.h index 6a70133c2a7513be3ee9efa52784ae00b4f09457..8ff72df6cdd99d4969622f952ccd452f0fa89fa1 100644 --- a/mace/ops/common/transpose.h +++ b/mace/ops/common/transpose.h @@ -20,7 +20,7 @@ #endif // MACE_ENABLE_NEON #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/public/mace.h" namespace mace { diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index aff95a2e552ed348faa59405713e2adae84ac8ea..65f05fdc63418d6a3e31cecd9700f6dd2055a02e 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -14,7 +14,8 @@ #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/core/quantize.h" #include "mace/utils/memory.h" @@ -221,7 +222,7 @@ class ConcatOp : public ConcatOpBase { }; #endif // MACE_ENABLE_OPENCL -void RegisterConcat(OpRegistryBase *op_registry) { +void RegisterConcat(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, DeviceType::CPU, float); diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index c2666d073c370240e3945f166b4ce18a9d9dc0ff..83da3f85c7185f2004248e5cd2ce3697c1ce58b1 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -24,32 +24,18 @@ #include #include "mace/core/future.h" -#include 
"mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/core/tensor.h" #include "mace/ops/activation.h" #include "mace/ops/conv_pool_2d_base.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/delegator/activation.h" +#include "mace/ops/delegator/bias_add.h" +#include "mace/ops/delegator/conv_2d.h" #include "mace/utils/memory.h" #include "mace/utils/math.h" -#ifdef MACE_ENABLE_NEON -#include "mace/ops/arm/fp32/conv_2d.h" -#include "mace/ops/arm/fp32/conv_2d_1x1.h" -#include "mace/ops/arm/fp32/conv_2d_3x3.h" -#include "mace/ops/arm/fp32/conv_2d_3x3_winograd.h" -#include "mace/ops/arm/fp32/conv_2d_5x5.h" -#include "mace/ops/arm/fp32/conv_2d_7x7.h" -#include "mace/ops/arm/fp32/conv_2d_1xn.h" -#include "mace/ops/arm/fp32/conv_general.h" -#include "mace/ops/arm/fp32/bias_add.h" -#include "mace/ops/arm/fp32/activation.h" -#else -#include "mace/ops/ref/activation.h" -#include "mace/ops/ref/bias_add.h" -#endif // MACE_ENABLE_NEON - -#include "mace/ops/ref/conv_2d.h" - #ifdef MACE_ENABLE_QUANTIZE #include "mace/ops/common/gemmlowp_util.h" #include "mace/ops/arm/q8/quantization_util.h" @@ -72,13 +58,21 @@ class Conv2dOp : public ConvPool2dOpBase { public: explicit Conv2dOp(OpConstructContext *context) : ConvPool2dOpBase(context), - activation_delegator_(ops::StringToActivationType( - Operation::GetOptionalArg("activation", - "NOOP")), - Operation::GetOptionalArg("max_limit", - 0.0f), - Operation::GetOptionalArg( - "leakyrelu_coefficient", 0.0f)) {} + activation_delegator_( + delegator::Activation::Create( + context->workspace(), + MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), + delegator::ActivationParam( + ops::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP")), + Operation::GetOptionalArg("max_limit", 0.0f), + Operation::GetOptionalArg("leakyrelu_coefficient", + 0.0f)))), + bias_add_delegator_(delegator::BiasAdd::Create( + context->workspace(), + MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE), + DelegatorParam())) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); @@ -86,116 +80,100 @@ class Conv2dOp : public ConvPool2dOpBase { const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr; Tensor *output = this->Output(OUTPUT); -#ifdef MACE_ENABLE_NEON - // the following params are used to decide which conv delegator to use - const index_t stride_h = strides_[0]; - const index_t stride_w = strides_[1]; - const index_t dilation_h = dilations_[0]; - const index_t dilation_w = dilations_[1]; - const index_t filter_h = filter->dim(2); - const index_t filter_w = filter->dim(3); - const index_t input_channels = input->dim(1); - const index_t channels = filter->dim(0); - - // NOTE: delegator is fixed after first round of running, - // although winograd depends on input params. - // We do not support changeable filter for now. 
if (conv2d_delegator_ == nullptr) { - if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 - && dilation_h == 1 && dilation_w == 1) { - conv2d_delegator_ = make_unique( - paddings_, padding_type_); - } else if (filter_h == 3 && filter_w == 3 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1) { - if (input_channels >= 8 && channels >= 8) { - conv2d_delegator_ = make_unique( - paddings_, padding_type_); - } else { - conv2d_delegator_ = make_unique( - paddings_, padding_type_); + std::string tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, + MACE_CPU_IMPL_TYPE, General); + if (MACE_CPU_IMPL_TYPE == NEON) { + // the following params are used to decide which conv delegator to use + const index_t stride_h = strides_[0]; + const index_t stride_w = strides_[1]; + const index_t dilation_h = dilations_[0]; + const index_t dilation_w = dilations_[1]; + const index_t filter_h = filter->dim(2); + const index_t filter_w = filter->dim(3); + const index_t input_channels = input->dim(1); + const index_t channels = filter->dim(0); + // NOTE: delegator is fixed after first round of running, + // although winograd depends on input params. + // We do not support changeable filter for now. + if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 + && dilation_h == 1 && dilation_w == 1) { + tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K1x1); + } else if (filter_h == 3 && filter_w == 3 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 + && dilation_w == 1) { + if (input_channels >= 8 && channels >= 8) { + tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K3x3Winograd); + } else { + tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K3x3S1); + } + } else if (filter_h == 3 && filter_w == 3 + && stride_h == 2 && stride_w == 2 && dilation_h == 1 + && dilation_w == 1) { + tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K3x3S2); + } else if (filter_h == 5 && filter_w == 5 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 + && dilation_w == 1) { + tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K5x5S1); + } else if (filter_h == 7 && filter_w == 7 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 + && dilation_w == 1) { + tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K7x7S1); + } else if (filter_h == 7 && filter_w == 7 + && stride_h == 2 && stride_w == 2 && dilation_h == 1 + && dilation_w == 1) { + tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K7x7S2); + } else if (filter_h == 7 && filter_w == 7 + && stride_h == 3 && stride_w == 3 && dilation_h == 1 + && dilation_w == 1) { + tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K7x7S3); + } else if (filter_h == 1 && filter_w == 7 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 + && dilation_w == 1) { + tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K1x7S1); + } else if (filter_h == 7 && filter_w == 1 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 + && dilation_w == 1) { + tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K7x1S1); + } else if (filter_h == 1 && filter_w == 15 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 + && dilation_w == 1) { + tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K1x15S1); + } else if (filter_h == 15 && filter_w == 1 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 + && dilation_w == 1) { + tag = 
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K15x1S1); } - } else if (filter_h == 3 && filter_w == 3 - && stride_h == 2 && stride_w == 2 && dilation_h == 1 - && dilation_w == 1) { - conv2d_delegator_ = make_unique( - paddings_, padding_type_); - } else if (filter_h == 5 && filter_w == 5 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1) { - conv2d_delegator_ = make_unique( - paddings_, padding_type_); - } else if (filter_h == 7 && filter_w == 7 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1) { - conv2d_delegator_ = make_unique( - paddings_, padding_type_); - } else if (filter_h == 7 && filter_w == 7 - && stride_h == 2 && stride_w == 2 && dilation_h == 1 - && dilation_w == 1) { - conv2d_delegator_ = make_unique( - paddings_, padding_type_); - } else if (filter_h == 7 && filter_w == 7 - && stride_h == 3 && stride_w == 3 && dilation_h == 1 - && dilation_w == 1) { - conv2d_delegator_ = make_unique( - paddings_, padding_type_); - } else if (filter_h == 1 && filter_w == 7 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1) { - conv2d_delegator_ = make_unique( - paddings_, padding_type_); - } else if (filter_h == 7 && filter_w == 1 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1) { - conv2d_delegator_ = make_unique( - paddings_, padding_type_); - } else if (filter_h == 1 && filter_w == 15 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1) { - conv2d_delegator_ = make_unique( - paddings_, padding_type_); - } else if (filter_h == 15 && filter_w == 1 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1) { - conv2d_delegator_ = make_unique( - paddings_, padding_type_); - } else { - conv2d_delegator_ = make_unique( - strides_, - dilations_, - paddings_, - padding_type_); } + delegator::Conv2dParam param(strides_, dilations_, + paddings_, padding_type_); + conv2d_delegator_ = delegator::Conv2d::Create(context->workspace(), + tag, param); } conv2d_delegator_->Compute(context, input, filter, output); -#else - if (ref_conv2d_delegator_ == nullptr) { - ref_conv2d_delegator_ = make_unique>(strides_, - dilations_, - paddings_, - padding_type_); - } - ref_conv2d_delegator_->Compute(context, input, filter, output); -#endif - - bias_add_delegator_.Compute(context, output, bias, output); - activation_delegator_.Compute(context, output, output); + bias_add_delegator_->Compute(context, output, bias, output); + activation_delegator_->Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } private: -#ifdef MACE_ENABLE_NEON - std::unique_ptr conv2d_delegator_; - arm::fp32::BiasAdd bias_add_delegator_; - arm::fp32::Activation activation_delegator_; -#else - std::unique_ptr> ref_conv2d_delegator_; - ref::BiasAdd bias_add_delegator_; - ref::Activation activation_delegator_; -#endif // MACE_ENABLE_NEON + std::unique_ptr activation_delegator_; + std::unique_ptr bias_add_delegator_; + std::unique_ptr conv2d_delegator_; private: MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); @@ -518,7 +496,7 @@ class Conv2dOp : public ConvPool2dOpBase { }; #endif // MACE_ENABLE_OPENCL -void RegisterConv2D(OpRegistryBase *op_registry) { +void RegisterConv2D(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU, float); diff --git a/mace/ops/conv_pool_2d_base.h b/mace/ops/conv_pool_2d_base.h index b5ad48aea307a138fbbea234b6f44465055817c4..8d96532d169bdafed2e15d2651ae1dc17a9819b2 100644 --- a/mace/ops/conv_pool_2d_base.h +++ 
b/mace/ops/conv_pool_2d_base.h @@ -17,7 +17,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" #include "mace/ops/common/conv_pool_2d_util.h" namespace mace { diff --git a/mace/ops/crop.cc b/mace/ops/crop.cc index acaa73f1cfe82834af09d098a7cfc2b12fe70880..5be823453ebd852ae24edbcdd1a33fa2893af03e 100644 --- a/mace/ops/crop.cc +++ b/mace/ops/crop.cc @@ -14,7 +14,8 @@ #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/utils/math.h" #include "mace/utils/memory.h" #ifdef MACE_ENABLE_OPENCL @@ -132,7 +133,7 @@ class CropOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterCrop(OpRegistryBase *op_registry) { +void RegisterCrop(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Crop", CropOp, DeviceType::CPU, float); MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp); diff --git a/mace/ops/cumsum.cc b/mace/ops/cumsum.cc index 302fdfd585f4a16a7da42ebe1fd495c4f0ce9b6e..b1cb58f0b268da6df2b98397a3a4d005d7706f01 100644 --- a/mace/ops/cumsum.cc +++ b/mace/ops/cumsum.cc @@ -14,7 +14,8 @@ #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -141,7 +142,7 @@ class CumsumOp : public Operation { bool checked_; }; -void RegisterCumsum(OpRegistryBase *op_registry) { +void RegisterCumsum(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Cumsum", CumsumOp, DeviceType::CPU, float); } diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 98298e0c9b709e51c9c8bda1a260bdd6dc8ed6e5..1e68449bdf1b36d9cbf7566a19f03a3194821069 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -14,20 +14,6 @@ #include "mace/ops/deconv_2d.h" -#if defined(MACE_ENABLE_NEON) -#include -#include "mace/ops/arm/fp32/deconv_2d_2x2.h" -#include "mace/ops/arm/fp32/deconv_2d_3x3.h" -#include "mace/ops/arm/fp32/deconv_2d_4x4.h" -#include "mace/ops/arm/fp32/deconv_2d_general.h" -#include "mace/ops/arm/fp32/bias_add.h" -#include "mace/ops/arm/fp32/activation.h" -#else -#include "mace/ops/ref/bias_add.h" -#include "mace/ops/ref/activation.h" -#include "mace/ops/ref/deconv_2d.h" -#endif - #include #include #include @@ -35,9 +21,13 @@ #include #include "mace/core/future.h" +#include "mace/core/registry/ops_registry.h" #include "mace/core/tensor.h" #include "mace/ops/activation.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/delegator/activation.h" +#include "mace/ops/delegator/bias_add.h" +#include "mace/ops/delegator/deconv_2d.h" #include "mace/utils/memory.h" #include "mace/utils/math.h" @@ -49,6 +39,10 @@ namespace mace { namespace ops { +namespace { +const std::vector kDeconv2dStrides = {1, 1}; +} + template class Deconv2dOp; @@ -57,9 +51,16 @@ class Deconv2dOp : public Deconv2dOpBase { public: explicit Deconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context), - activation_delegator_(activation_, - relux_max_limit_, - leakyrelu_coefficient_) {} + activation_delegator_( + delegator::Activation::Create( + context->workspace(), + MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), + delegator::ActivationParam(activation_, relux_max_limit_, + leakyrelu_coefficient_))), + bias_add_delegator_(delegator::BiasAdd::Create( + context->workspace(), + MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE), + DelegatorParam())) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(0); @@ -79,91 +80,67 @@ 
class Deconv2dOp : public Deconv2dOpBase { MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); -#ifdef MACE_ENABLE_NEON - const index_t kernel_h = filter->dim(2); - const index_t kernel_w = filter->dim(3); - bool use_neon_2x2_s1 = kernel_h == kernel_w && kernel_h == 2 && - strides_[0] == strides_[1] && strides_[0] == 1; - bool use_neon_2x2_s2 = kernel_h == kernel_w && kernel_h == 2 && - strides_[0] == strides_[1] && strides_[0] == 2; + if (deconv2d_delegator_ == nullptr) { + std::string tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, General); + if (MACE_CPU_IMPL_TYPE == NEON) { + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); - bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 && - strides_[0] == strides_[1] && strides_[0] == 1; - bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 && - strides_[0] == strides_[1] && strides_[0] == 2; + bool use_neon_2x2_s1 = kernel_h == kernel_w && kernel_h == 2 && + strides_[0] == strides_[1] && strides_[0] == 1; + bool use_neon_2x2_s2 = kernel_h == kernel_w && kernel_h == 2 && + strides_[0] == strides_[1] && strides_[0] == 2; - bool use_neon_4x4_s1 = kernel_h == kernel_w && kernel_h == 4 && - strides_[0] == strides_[1] && strides_[0] == 1; - bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 && - strides_[0] == strides_[1] && strides_[0] == 2; + bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 && + strides_[0] == strides_[1] && strides_[0] == 1; + bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 && + strides_[0] == strides_[1] && strides_[0] == 2; - if (deconv2d_delegator_ == nullptr) { - if (use_neon_2x2_s1) { - deconv2d_delegator_ = make_unique( - paddings_, padding_type_, model_type_); - } else if (use_neon_2x2_s2) { - deconv2d_delegator_ = make_unique( - paddings_, padding_type_, model_type_); - } else if (use_neon_3x3_s1) { - deconv2d_delegator_ = make_unique( - paddings_, padding_type_, model_type_); - } else if (use_neon_3x3_s2) { - deconv2d_delegator_ = make_unique( - paddings_, padding_type_, model_type_); - } else if (use_neon_4x4_s1) { - deconv2d_delegator_ = make_unique( - paddings_, padding_type_, model_type_); - } else if (use_neon_4x4_s2) { - deconv2d_delegator_ = make_unique( - paddings_, padding_type_, model_type_); - } else { - deconv2d_delegator_ = - make_unique(strides_, - std::vector{1, 1}, - paddings_, - padding_type_, - model_type_); + bool use_neon_4x4_s1 = kernel_h == kernel_w && kernel_h == 4 && + strides_[0] == strides_[1] && strides_[0] == 1; + bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 && + strides_[0] == strides_[1] && strides_[0] == 2; + + if (use_neon_2x2_s1) { + tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K2x2S1); + } else if (use_neon_2x2_s2) { + tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K2x2S2); + } else if (use_neon_3x3_s1) { + tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K3x3S1); + } else if (use_neon_3x3_s2) { + tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K3x3S2); + } else if (use_neon_4x4_s1) { + tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K4x4S1); + } else if (use_neon_4x4_s2) { + tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K4x4S2); + } } + delegator::Deconv2dParam param(strides_, kDeconv2dStrides, paddings_, + padding_type_, model_type_); + deconv2d_delegator_ = delegator::Deconv2d::Create(context->workspace(), + 
tag, param); } - deconv2d_delegator_->Compute(context, - input, - filter, - output_shape_tensor, - output); -#else - if (deconv2d_delegator_ == nullptr) { - deconv2d_delegator_ = make_unique>(strides_, - std::vector{ - 1, 1}, - paddings_, - padding_type_, - model_type_); - } - deconv2d_delegator_->Compute(context, - input, - filter, - output_shape_tensor, - output); - -#endif // MACE_ENABLE_NEON - bias_add_delegator_.Compute(context, output, bias, output); - activation_delegator_.Compute(context, output, output); + deconv2d_delegator_->Compute(context, input, filter, + output_shape_tensor, output); + bias_add_delegator_->Compute(context, output, bias, output); + activation_delegator_->Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } private: -#ifdef MACE_ENABLE_NEON - std::unique_ptr deconv2d_delegator_; - arm::fp32::BiasAdd bias_add_delegator_; - arm::fp32::Activation activation_delegator_; -#else - ref::BiasAdd bias_add_delegator_; - ref::Activation activation_delegator_; - std::unique_ptr> deconv2d_delegator_; -#endif // MACE_ENABLE_NEON + std::unique_ptr activation_delegator_; + std::unique_ptr bias_add_delegator_; + std::unique_ptr deconv2d_delegator_; }; #ifdef MACE_ENABLE_OPENCL @@ -258,7 +235,7 @@ class Deconv2dOp : public Deconv2dOpBase { }; #endif // MACE_ENABLE_OPENCL -void RegisterDeconv2D(OpRegistryBase *op_registry) { +void RegisterDeconv2D(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, DeviceType::CPU, float); MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp); diff --git a/mace/ops/deconv_2d.h b/mace/ops/deconv_2d.h index 50a2ecee5e8329ea24aa3fbae419823831d1b370..a11d5f8a8bd77a7be78605a6a256331d2ceccdd7 100644 --- a/mace/ops/deconv_2d.h +++ b/mace/ops/deconv_2d.h @@ -19,7 +19,7 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" #include "mace/core/types.h" #include "mace/ops/activation.h" #include "mace/ops/common/conv_pool_2d_util.h" diff --git a/mace/ops/delegator/activation.h b/mace/ops/delegator/activation.h new file mode 100644 index 0000000000000000000000000000000000000000..80a9c6b376fceda5d84d2de4eb7358213df9613b --- /dev/null +++ b/mace/ops/delegator/activation.h @@ -0,0 +1,61 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_OPS_DELEGATOR_ACTIVATION_H_ +#define MACE_OPS_DELEGATOR_ACTIVATION_H_ + +#include "mace/core/ops/op_context.h" +#include "mace/core/ops/op_delegator.h" +#include "mace/core/registry/op_delegator_registry.h" +#include "mace/ops/common/activation_type.h" + +namespace mace { +namespace ops { +namespace delegator { + +struct ActivationParam : public DelegatorParam { + explicit ActivationParam(ActivationType type, const float limit, + const float leakyrelu_coefficient) + : type_(type), limit_(limit), + leakyrelu_coefficient_(leakyrelu_coefficient) {} + + ActivationType type_; + const float limit_; + const float leakyrelu_coefficient_; +}; + +class Activation : public OpDelegator { + public: + explicit Activation(const ActivationParam ¶m) + : OpDelegator(param), type_(param.type_), limit_(param.limit_), + leakyrelu_coefficient_(param.leakyrelu_coefficient_) {} + virtual ~Activation() = default; + + MACE_DEFINE_DELEGATOR_CREATOR(Activation) + + virtual MaceStatus Compute(const OpContext *context, + const Tensor *input, + Tensor *output) = 0; + + protected: + ActivationType type_; + const float limit_; + const float leakyrelu_coefficient_; +}; + +} // namespace delegator +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_DELEGATOR_ACTIVATION_H_ diff --git a/mace/ops/delegator/bias_add.h b/mace/ops/delegator/bias_add.h new file mode 100644 index 0000000000000000000000000000000000000000..f5fdea0deea984cf2450d2f17cd29c6913a35bd9 --- /dev/null +++ b/mace/ops/delegator/bias_add.h @@ -0,0 +1,43 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_DELEGATOR_BIAS_ADD_H_ +#define MACE_OPS_DELEGATOR_BIAS_ADD_H_ + +#include "mace/core/ops/op_context.h" +#include "mace/core/ops/op_delegator.h" +#include "mace/core/registry/op_delegator_registry.h" + +namespace mace { +namespace ops { +namespace delegator { + +class BiasAdd : public OpDelegator { + public: + explicit BiasAdd(const DelegatorParam ¶m) : OpDelegator(param) {} + virtual ~BiasAdd() = default; + + MACE_DEFINE_DELEGATOR_CREATOR(BiasAdd) + + virtual MaceStatus Compute(const OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output) = 0; +}; + +} // namespace delegator +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_DELEGATOR_BIAS_ADD_H_ diff --git a/mace/ops/delegator/conv_2d.h b/mace/ops/delegator/conv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..9ff85f6dacd1123cfbd02a12f90990c6750d5c37 --- /dev/null +++ b/mace/ops/delegator/conv_2d.h @@ -0,0 +1,90 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#ifndef MACE_OPS_DELEGATOR_CONV_2D_H_ +#define MACE_OPS_DELEGATOR_CONV_2D_H_ + +#include + +#include "mace/core/ops/op_context.h" +#include "mace/core/ops/op_delegator.h" +#include "mace/core/registry/op_delegator_registry.h" +#include "mace/ops/common/conv_pool_2d_util.h" + +namespace mace { +namespace ops { + +enum ConvType { + General, + K1x1, + K1x7S1, + K7x1S1, + K1x15S1, + K15x1S1, + K3x3S1, + K3x3S2, + K3x3Winograd, + K5x5S1, + K7x7S1, + K7x7S2, + K7x7S3, +}; + +namespace delegator { + +struct Conv2dParam : public DelegatorParam { + explicit Conv2dParam(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, + const Padding padding_type) + : strides_(strides), dilations_(dilations), + paddings_(paddings), padding_type_(padding_type) {} + + const std::vector &strides_; + const std::vector &dilations_; + const std::vector &paddings_; + const Padding padding_type_; +}; + +class Conv2d : public OpDelegator { + public: + explicit Conv2d(const delegator::Conv2dParam ¶m) + : OpDelegator(param), + strides_(param.strides_), + dilations_(param.dilations_), + paddings_(param.paddings_), + padding_type_(param.padding_type_) {} + virtual ~Conv2d() = default; + + MACE_DEFINE_DELEGATOR_CREATOR(Conv2d) + + virtual MaceStatus Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) = 0; + + protected: + const std::vector strides_; + const std::vector dilations_; + const std::vector paddings_; + const Padding padding_type_; +}; + +} // namespace delegator +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_DELEGATOR_CONV_2D_H_ + diff --git a/mace/ops/delegator/deconv_2d.h b/mace/ops/delegator/deconv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..856f3595bcd37b86dc3c65d2c48a70a4901f3b47 --- /dev/null +++ b/mace/ops/delegator/deconv_2d.h @@ -0,0 +1,95 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
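`activation.h`, `bias_add.h` and `conv_2d.h` above fix the shape every delegator interface takes: a `DelegatorParam` subclass when construction-time settings are needed, an abstract class derived from `OpDelegator`, a `MACE_DEFINE_DELEGATOR_CREATOR(...)` line that generates the static `Create(...)` factory the ops call, and a pure-virtual `Compute(...)`. As a minimal sketch of that boilerplate, a hypothetical `Softmax` delegator header (not part of this patch, shown only to illustrate the pattern) might look like:

```c++
// mace/ops/delegator/softmax.h -- hypothetical, for illustration only.
#ifndef MACE_OPS_DELEGATOR_SOFTMAX_H_
#define MACE_OPS_DELEGATOR_SOFTMAX_H_

#include "mace/core/ops/op_context.h"
#include "mace/core/ops/op_delegator.h"
#include "mace/core/registry/op_delegator_registry.h"

namespace mace {
namespace ops {
namespace delegator {

class Softmax : public OpDelegator {
 public:
  // No construction-time settings, so the plain DelegatorParam is enough.
  explicit Softmax(const DelegatorParam &param) : OpDelegator(param) {}
  virtual ~Softmax() = default;

  // Generates the static Create() factory used by the ops in this patch.
  MACE_DEFINE_DELEGATOR_CREATOR(Softmax)

  virtual MaceStatus Compute(const OpContext *context,
                             const Tensor *input,
                             Tensor *output) = 0;
};

}  // namespace delegator
}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_DELEGATOR_SOFTMAX_H_
```

Concrete portable and NEON implementations then derive from this class and are registered under keys built with `MACE_DELEGATOR_KEY` / `MACE_DELEGATOR_KEY_EX`, so ops never name them directly.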
+ + +#ifndef MACE_OPS_DELEGATOR_DECONV_2D_H_ +#define MACE_OPS_DELEGATOR_DECONV_2D_H_ + +#include + +#include "mace/core/ops/op_context.h" +#include "mace/core/ops/op_delegator.h" +#include "mace/core/registry/op_delegator_registry.h" + +namespace mace { +namespace ops { + +enum DeconvType { + General, + K2x2S1, + K2x2S2, + K3x3S1, + K3x3S2, + K4x4S1, + K4x4S2, +}; + +namespace delegator { + +struct Deconv2dParam : public DelegatorParam { + explicit Deconv2dParam(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, + const Padding padding_type, + const FrameworkType framework_type, + const int group = 1) + : strides_(strides), dilations_(dilations), + paddings_(paddings), padding_type_(padding_type), + framework_type_(framework_type), + group_(group) {} + + const std::vector &strides_; + const std::vector &dilations_; + const std::vector &paddings_; + const Padding padding_type_; + const FrameworkType framework_type_; + const int group_; +}; + +class Deconv2d : public OpDelegator { + public: + explicit Deconv2d(const Deconv2dParam ¶m) + : OpDelegator(param), + strides_(param.strides_), + dilations_(param.dilations_), + paddings_(param.paddings_), + padding_type_(param.padding_type_), + framework_type_(param.framework_type_), + group_(param.group_) {} + + virtual ~Deconv2d() = default; + + MACE_DEFINE_DELEGATOR_CREATOR(Deconv2d) + + virtual MaceStatus Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) = 0; + + protected: + const std::vector strides_; + const std::vector dilations_; + const std::vector paddings_; + const Padding padding_type_; + const FrameworkType framework_type_; + const int group_; +}; + +} // namespace delegator +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_DELEGATOR_DECONV_2D_H_ + diff --git a/mace/ops/ref/bias_add.h b/mace/ops/delegator/depthwise_conv_2d.h similarity index 52% rename from mace/ops/ref/bias_add.h rename to mace/ops/delegator/depthwise_conv_2d.h index f3dc6096e0ae409d0a4b226ebd21b04d6e0228b5..c586839bbcdb3a3d42a2a200fffaaf2e40a9432d 100644 --- a/mace/ops/ref/bias_add.h +++ b/mace/ops/delegator/depthwise_conv_2d.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,35 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. 
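One detail of `Conv2dParam` (and of the other param structs above) worth noting: its members are references, while the corresponding members of the abstract delegator are values, so the delegator base class takes its own copies at construction and the param can be a short-lived temporary. A sketch of the resulting one-expression creation, assuming the usual `strides_`/`dilations_`/`paddings_`/`padding_type_` op members, a `conv2d_delegator_` member of type `std::unique_ptr<delegator::Conv2d>`, and a portable kernel registered under the same `General` tag convention used elsewhere in this patch:

```c++
// Inside a CPU float op's Run(OpContext *context).
if (conv2d_delegator_ == nullptr) {
  const std::string tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
                                                MACE_CPU_IMPL_TYPE, General);
  // Conv2dParam only borrows the vectors; the Conv2d base copies them in its
  // constructor, so passing a temporary param to Create() is enough here.
  conv2d_delegator_ = delegator::Conv2d::Create(
      context->workspace(), tag,
      delegator::Conv2dParam(strides_, dilations_, paddings_, padding_type_));
}
conv2d_delegator_->Compute(context, input, filter, output);
```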
-#ifndef MACE_OPS_REF_BIAS_ADD_H_ -#define MACE_OPS_REF_BIAS_ADD_H_ -#include "mace/core/op_context.h" +#ifndef MACE_OPS_DELEGATOR_DEPTHWISE_CONV_2D_H_ +#define MACE_OPS_DELEGATOR_DEPTHWISE_CONV_2D_H_ + +#include "mace/ops/delegator/conv_2d.h" namespace mace { namespace ops { -namespace ref { - -class BiasAdd { - public: - BiasAdd() = default; - ~BiasAdd() = default; - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *bias, - Tensor *output); - - private: - void AddBias(const OpContext *context, - const Tensor *input, - const Tensor *bias, - Tensor *output); -}; - -} // namespace ref +namespace delegator { + +typedef Conv2dParam DepthwiseConv2dParam; +typedef Conv2d DepthwiseConv2d; + +} // namespace delegator } // namespace ops } // namespace mace -#endif // MACE_OPS_REF_BIAS_ADD_H_ +#endif // MACE_OPS_DELEGATOR_DEPTHWISE_CONV_2D_H_ + diff --git a/mace/ops/delegator/depthwise_deconv_2d.h b/mace/ops/delegator/depthwise_deconv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..10616f1b10f7470fefecf58a23713aaf9c168709 --- /dev/null +++ b/mace/ops/delegator/depthwise_deconv_2d.h @@ -0,0 +1,35 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#ifndef MACE_OPS_DELEGATOR_DEPTHWISE_DECONV_2D_H_ +#define MACE_OPS_DELEGATOR_DEPTHWISE_DECONV_2D_H_ + +#include "mace/ops/delegator/deconv_2d.h" +namespace mace { +namespace ops { +namespace delegator { + +typedef Deconv2dParam DepthwiseDeconv2dParam; +typedef Deconv2dParam GroupDeconv2dParam; + +typedef Deconv2d DepthwiseDeconv2d; +typedef Deconv2d GroupDeconv2d; + +} // namespace delegator +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_DELEGATOR_DEPTHWISE_DECONV_2D_H_ + diff --git a/mace/ops/delegator/eltwise.h b/mace/ops/delegator/eltwise.h new file mode 100644 index 0000000000000000000000000000000000000000..fe66f35462270535bfda14fb50b907e3309dee6b --- /dev/null +++ b/mace/ops/delegator/eltwise.h @@ -0,0 +1,57 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
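`depthwise_conv_2d.h` and `depthwise_deconv_2d.h` above add no new interfaces; they only alias `Conv2d`/`Deconv2d`, so depthwise and grouped kernels are selected purely by key. Together with the `framework_type` and `group` fields of `Deconv2dParam`, a Caffe-style grouped deconvolution is wired roughly as follows (a sketch modeled on the `depthwise_deconv2d.cc` hunk later in this patch; `group_deconv2d_delegator_` and `unit_dilations` are illustrative names, and the `{1, 1}` dilation vector mirrors the constant those ops use):

```c++
// Inside a CPU float op's Run(OpContext *context); group_ > 1, Caffe model,
// group_deconv2d_delegator_ is a std::unique_ptr<delegator::GroupDeconv2d>.
if (group_deconv2d_delegator_ == nullptr) {
  const std::string tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float,
                                                MACE_CPU_IMPL_TYPE, General);
  const std::vector<int> unit_dilations{1, 1};  // fixed, as in this patch
  delegator::GroupDeconv2dParam param(strides_, unit_dilations, paddings_,
                                      padding_type_, CAFFE, group_);
  group_deconv2d_delegator_ = delegator::GroupDeconv2d::Create(
      context->workspace(), tag, param);
}
// The depthwise/group op in this patch passes no output-shape tensor here.
group_deconv2d_delegator_->Compute(context, input, filter,
                                   /*output_shape=*/nullptr, output);
```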
+ +// This implements matrix-vector multiplication described as +// https://github.com/google/gemmlowp/blob/master/todo/fast-gemv.txt + +#ifndef MACE_OPS_DELEGATOR_ELTWISE_H_ +#define MACE_OPS_DELEGATOR_ELTWISE_H_ + +#include "mace/core/ops/op_context.h" +#include "mace/core/ops/op_delegator.h" +#include "mace/core/registry/op_delegator_registry.h" +#include "mace/core/types.h" +#include "mace/ops/common/eltwise_type.h" + +namespace mace { +namespace ops { +namespace delegator { + +struct EltwiseParam : public DelegatorParam { + explicit EltwiseParam(EltwiseType type) + : type_(type) {} + + EltwiseType type_; +}; + +class Eltwise : public OpDelegator { + public: + explicit Eltwise(const EltwiseParam ¶m) : OpDelegator(param), + type_(param.type_) {} + virtual ~Eltwise() = default; + + MACE_DEFINE_DELEGATOR_CREATOR(Eltwise) + + virtual MaceStatus Compute(const OpContext *context, const Tensor *input0, + const Tensor *input1, Tensor *output) = 0; + + protected: + EltwiseType type_; +}; + +} // namespace delegator +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_DELEGATOR_ELTWISE_H_ diff --git a/mace/ops/delegator/gemm.h b/mace/ops/delegator/gemm.h new file mode 100644 index 0000000000000000000000000000000000000000..29043c3b27260740e0c924a2ec4dbd6fda52b666 --- /dev/null +++ b/mace/ops/delegator/gemm.h @@ -0,0 +1,77 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
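`eltwise.h` above is the interface the quantized `EltwiseOp` switches to later in this patch: the element-wise operation is fixed at construction through `EltwiseParam`, while the per-tensor quantization parameters are presumably read from the tensors inside `Compute`, since they appear nowhere in the param. A sketch of obtaining and using it, assuming a `uint8_t` implementation is registered for the build's `MACE_CPU_IMPL_TYPE` and using `SUM` as the example operation (the real op reads the type from its `type` argument):

```c++
#include "mace/ops/delegator/eltwise.h"

// In the op constructor (OpConstructContext *context), mirroring eltwise.cc:
eltwise_delegator_ = delegator::Eltwise::Create(
    context->workspace(),
    MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, MACE_CPU_IMPL_TYPE),
    delegator::EltwiseParam(ops::EltwiseType::SUM));

// In Run(OpContext *context): the whole quantized element-wise computation
// becomes a single delegated call.
return eltwise_delegator_->Compute(context, input0, input1, output);
```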
+ + +#ifndef MACE_OPS_DELEGATOR_GEMM_H_ +#define MACE_OPS_DELEGATOR_GEMM_H_ + +#include "mace/core/ops/op_context.h" +#include "mace/core/ops/op_delegator.h" +#include "mace/core/registry/op_delegator_registry.h" +#include "mace/ops/common/matrix.h" + +namespace mace { +namespace ops { +namespace delegator { + +struct GemmParam : public DelegatorParam { + explicit GemmParam(const bool should_cache_pack = false) + : should_cache_pack_(should_cache_pack) {} + + const bool should_cache_pack_; +}; + +class Gemm : public OpDelegator { + public: + explicit Gemm(const GemmParam ¶m) : OpDelegator(param) {} + virtual ~Gemm() = default; + + MACE_DEFINE_DELEGATOR_CREATOR(Gemm) + + virtual MaceStatus Compute(const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const index_t batch, + const index_t rows, + const index_t cols, + const index_t depth, + const MatrixMajor lhs_major, + const MatrixMajor rhs_major, + const MatrixMajor output_major, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) = 0; + // Original matrix before transpose has row-major + virtual MaceStatus Compute(const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const index_t batch, + const index_t lhs_rows, + const index_t lhs_cols, + const index_t rhs_rows, + const index_t rhs_cols, + const bool transpose_lhs, + const bool transpose_rhs, + const bool transpose_out, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) = 0; +}; + +} // namespace delegator +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_DELEGATOR_GEMM_H_ + diff --git a/mace/ops/delegator/gemv.h b/mace/ops/delegator/gemv.h new file mode 100644 index 0000000000000000000000000000000000000000..4bdde1820463b140a5a8ba003f91529fd12956bb --- /dev/null +++ b/mace/ops/delegator/gemv.h @@ -0,0 +1,52 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
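`gemm.h` above exposes two pure-virtual `Compute` overloads: one that describes operands with explicit `MatrixMajor` layouts and one that takes the original row-major shapes plus transpose flags, which is the form `matmul.cc` uses later in this patch. The only construction-time knob is `should_cache_pack`, which by its name allows an implementation to cache packed operands across runs. A sketch of creating the delegator and issuing a batched `C = A * B` through the transpose-flag overload (the shapes `M`, `K`, `N` and the tensor names are placeholders; setting `should_cache_pack` to `true` is an illustrative choice):

```c++
#include "mace/ops/delegator/gemm.h"

// In the op constructor:
gemm_ = delegator::Gemm::Create(
    context->workspace(),
    MACE_DELEGATOR_KEY(Gemm, CPU, float, MACE_CPU_IMPL_TYPE),
    delegator::GemmParam(/*should_cache_pack=*/true));

// In Run(OpContext *context): batch of row-major [M, K] x [K, N] products.
MaceStatus status = gemm_->Compute(context, lhs, rhs, batch,
                                   /*lhs_rows=*/M, /*lhs_cols=*/K,
                                   /*rhs_rows=*/K, /*rhs_cols=*/N,
                                   /*transpose_lhs=*/false,
                                   /*transpose_rhs=*/false,
                                   /*transpose_out=*/false,
                                   /*lhs_batched=*/true,
                                   /*rhs_batched=*/true,
                                   output);
```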
+ + +#ifndef MACE_OPS_DELEGATOR_GEMV_H_ +#define MACE_OPS_DELEGATOR_GEMV_H_ + +#include "mace/core/ops/op_context.h" +#include "mace/core/ops/op_delegator.h" +#include "mace/core/registry/op_delegator_registry.h" + +namespace mace { +namespace ops { +namespace delegator { + +class Gemv : public OpDelegator { + public: + explicit Gemv(const DelegatorParam ¶m) : OpDelegator(param) {} + virtual ~Gemv() = default; + + MACE_DEFINE_DELEGATOR_CREATOR(Gemv) + + // Always row-major after transpose + virtual MaceStatus Compute(const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const Tensor *bias, + const index_t batch, + const index_t lhs_height, + const index_t lhs_width, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) = 0; +}; + +} // namespace delegator +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_DELEGATOR_GEMV_H_ + diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index 6efa4d24566972164fd39d848d037f8c850e12e2..9484fdde2964952389e3402d2ffb7323076a153e 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -15,7 +15,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/depth_to_space.h" #endif // MACE_ENABLE_OPENCL @@ -184,7 +185,7 @@ class DepthToSpaceOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterDepthToSpace(OpRegistryBase *op_registry) { +void RegisterDepthToSpace(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "DepthToSpace", DepthToSpaceOp, DeviceType::CPU, float); diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index 06964ee038088d6921b5d9244eac3c14913522ae..23cf8e046fa82edbab28cbddfb57a99d721c61ac 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -17,17 +17,6 @@ #include #include -#include "mace/ops/ref/depthwise_conv_2d.h" - -#if defined(MACE_ENABLE_NEON) -#include "mace/ops/arm/fp32/depthwise_conv_2d_3x3.h" -#include "mace/ops/arm/fp32/bias_add.h" -#include "mace/ops/arm/fp32/activation.h" -#else -#include "mace/ops/ref/activation.h" -#include "mace/ops/ref/bias_add.h" -#endif // MACE_ENABLE_NEON - #ifdef MACE_ENABLE_QUANTIZE #include "mace/ops/arm/q8/quantization_util.h" // We reuse TensorFlow Lite's optimized depthwiseconv_uint8 and parallelized it @@ -36,9 +25,13 @@ #endif // MACE_ENABLE_QUANTIZE #include "mace/core/future.h" -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/ops/activation.h" #include "mace/ops/conv_pool_2d_base.h" +#include "mace/ops/delegator/activation.h" +#include "mace/ops/delegator/bias_add.h" +#include "mace/ops/delegator/depthwise_conv_2d.h" #include "mace/public/mace.h" #include "mace/utils/memory.h" #include "mace/core/quantize.h" @@ -75,9 +68,16 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { public: explicit DepthwiseConv2dOp(OpConstructContext *context) : DepthwiseConv2dOpBase(context), - activation_delegator_(activation_, - relux_max_limit_, - leakyrelu_coefficient_) {} + activation_delegator_( + delegator::Activation::Create( + context->workspace(), + MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), + delegator::ActivationParam(activation_, relux_max_limit_, + leakyrelu_coefficient_))), + bias_add_delegator_(delegator::BiasAdd::Create( + context->workspace(), + MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE), + 
DelegatorParam())) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -92,67 +92,44 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); -#ifdef MACE_ENABLE_NEON - const index_t filter_h = filter->dim(2); - const index_t filter_w = filter->dim(3); - const index_t stride_h = strides_[0]; - const index_t stride_w = strides_[1]; - const index_t dilation_h = dilations_[0]; - const index_t dilation_w = dilations_[1]; - - if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 - && dilation_h == 1 && dilation_w == 1) { - if (conv2d_delegator_.get() == nullptr) { - conv2d_delegator_ = - make_unique(paddings_, - padding_type_); - } - conv2d_delegator_->Compute(context, input, filter, output); - } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2 - && dilation_h == 1 && dilation_w == 1) { - if (conv2d_delegator_.get() == nullptr) { - conv2d_delegator_ = - make_unique(paddings_, - padding_type_); - } - conv2d_delegator_->Compute(context, input, filter, output); - } else { - if (ref_conv2d_delegator_.get() == nullptr) { - ref_conv2d_delegator_ = - make_unique>(strides_, - dilations_, - paddings_, - padding_type_); + if (depthwise_conv2d_delegator_ == nullptr) { + std::string tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, + REF, General); + if (MACE_CPU_IMPL_TYPE == NEON) { + const index_t filter_h = filter->dim(2); + const index_t filter_w = filter->dim(3); + const index_t stride_h = strides_[0]; + const index_t stride_w = strides_[1]; + const index_t dilation_h = dilations_[0]; + const index_t dilation_w = dilations_[1]; + if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 + && dilation_h == 1 && dilation_w == 1) { + tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K3x3S1); + } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 + && stride_w == 2 + && dilation_h == 1 && dilation_w == 1) { + tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K3x3S2); + } } - ref_conv2d_delegator_->Compute(context, input, filter, output); - } -#else - if (ref_conv2d_delegator_.get() == nullptr) { - ref_conv2d_delegator_ = - make_unique>(strides_, - dilations_, - paddings_, - padding_type_); + delegator::Conv2dParam param(strides_, dilations_, + paddings_, padding_type_); + depthwise_conv2d_delegator_ = delegator::DepthwiseConv2d::Create( + context->workspace(), tag, param); } - ref_conv2d_delegator_->Compute(context, input, filter, output); -#endif // MACE_ENABLE_NEON - bias_add_delegator_.Compute(context, output, bias, output); - activation_delegator_.Compute(context, output, output); + depthwise_conv2d_delegator_->Compute(context, input, filter, output); + bias_add_delegator_->Compute(context, output, bias, output); + activation_delegator_->Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } private: -#ifdef MACE_ENABLE_NEON - std::unique_ptr conv2d_delegator_; - arm::fp32::BiasAdd bias_add_delegator_; - arm::fp32::Activation activation_delegator_; -#else - ref::BiasAdd bias_add_delegator_; - ref::Activation activation_delegator_; -#endif // MACE_ENABLE_NEON - std::unique_ptr> ref_conv2d_delegator_; + std::unique_ptr activation_delegator_; + std::unique_ptr bias_add_delegator_; + std::unique_ptr depthwise_conv2d_delegator_; protected: MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); @@ -422,7 +399,7 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { }; #endif // MACE_ENABLE_OPENCL -void 
RegisterDepthwiseConv2d(OpRegistryBase *op_registry) { +void RegisterDepthwiseConv2d(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", DepthwiseConv2dOp, DeviceType::CPU, float); diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc index 96f6d575fd2c8663d7c2c860dbbdbd7d0801713d..f09261d6541b4b771baa1a2fe1ac85fad49e5b7d 100644 --- a/mace/ops/depthwise_deconv2d.cc +++ b/mace/ops/depthwise_deconv2d.cc @@ -12,33 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/ops/deconv_2d.h" - -#if defined(MACE_ENABLE_NEON) -#include -#include "mace/ops/arm/fp32/depthwise_deconv_2d_general.h" -#include "mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h" -#include "mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h" -#include "mace/ops/arm/fp32/bias_add.h" -#include "mace/ops/arm/fp32/activation.h" - -#else -#include "mace/ops/ref/depthwise_deconv_2d.h" -#include "mace/ops/ref/bias_add.h" -#include "mace/ops/ref/activation.h" -#endif - #include #include #include #include #include "mace/core/future.h" +#include "mace/core/registry/ops_registry.h" #include "mace/core/tensor.h" -#include "mace/utils/math.h" +#include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/deconv_2d.h" +#include "mace/ops/delegator/activation.h" +#include "mace/ops/delegator/bias_add.h" +#include "mace/ops/delegator/depthwise_deconv_2d.h" #include "mace/public/mace.h" +#include "mace/utils/math.h" #include "mace/utils/memory.h" -#include "mace/ops/common/conv_pool_2d_util.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" @@ -48,6 +37,10 @@ namespace mace { namespace ops { +namespace { +const std::vector kDepthwiseStrides = {1, 1}; +} + template class DepthwiseDeconv2dOp; @@ -57,9 +50,16 @@ class DepthwiseDeconv2dOp public: explicit DepthwiseDeconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context), - activation_delegator_(activation_, - relux_max_limit_, - leakyrelu_coefficient_) {} + activation_delegator_( + delegator::Activation::Create( + context->workspace(), + MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), + delegator::ActivationParam(activation_, relux_max_limit_, + leakyrelu_coefficient_))), + bias_add_delegator_(delegator::BiasAdd::Create( + context->workspace(), + MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE), + DelegatorParam())) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(0); @@ -74,113 +74,77 @@ class DepthwiseDeconv2dOp const index_t in_channels = input->dim(1); bool is_depthwise = group_ == in_channels; -#ifdef MACE_ENABLE_NEON - const index_t kernel_h = filter->dim(2); - const index_t kernel_w = filter->dim(3); - bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 && - strides_[0] == strides_[1] && strides_[0] == 1; - bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 && - strides_[0] == strides_[1] && strides_[0] == 2; - bool use_neon_4x4_s1 = kernel_h == kernel_w && kernel_h == 4 && - strides_[0] == strides_[1] && strides_[0] == 1; - bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 && - strides_[0] == strides_[1] && strides_[0] == 2; - - if (deconv2d_delegator_ == nullptr) { - if (is_depthwise) { - if (use_neon_3x3_s1) { - deconv2d_delegator_ = make_unique( - paddings_, padding_type_, CAFFE); - } else if (use_neon_3x3_s2) { - deconv2d_delegator_ = make_unique( - paddings_, padding_type_, CAFFE); - } else if (use_neon_4x4_s1) { - deconv2d_delegator_ = 
make_unique( - paddings_, padding_type_, CAFFE); - } else if (use_neon_4x4_s2) { - deconv2d_delegator_ = make_unique( - paddings_, padding_type_, CAFFE); - } else { - deconv2d_delegator_ = - make_unique( - strides_, - std::vector{1, 1}, - paddings_, - padding_type_, - CAFFE); - } - } else { - if (use_neon_3x3_s1) { - deconv2d_delegator_ = make_unique( - paddings_, padding_type_, group_, CAFFE); - } else if (use_neon_3x3_s2) { - deconv2d_delegator_ = make_unique( - paddings_, padding_type_, group_, CAFFE); - } else if (use_neon_4x4_s1) { - deconv2d_delegator_ = make_unique( - paddings_, padding_type_, group_, CAFFE); - } else if (use_neon_4x4_s2) { - deconv2d_delegator_ = make_unique( - paddings_, padding_type_, group_, CAFFE); + if (depthwise_deconv2d_delegator_ == nullptr) { + if (MACE_CPU_IMPL_TYPE == NEON) { + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); + bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 && + strides_[0] == strides_[1] && strides_[0] == 1; + bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 && + strides_[0] == strides_[1] && strides_[0] == 2; + bool use_neon_4x4_s1 = kernel_h == kernel_w && kernel_h == 4 && + strides_[0] == strides_[1] && strides_[0] == 1; + bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 && + strides_[0] == strides_[1] && strides_[0] == 2; + + if (is_depthwise) { + std::string tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, General); + if (use_neon_3x3_s1) { + tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K3x3S1); + } else if (use_neon_3x3_s2) { + tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K3x3S2); + } else if (use_neon_4x4_s1) { + tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K4x4S1); + } else if (use_neon_4x4_s2) { + tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K4x4S2); + } + delegator::DepthwiseDeconv2dParam param(strides_, kDepthwiseStrides, + paddings_, padding_type_, + CAFFE, group_); + depthwise_deconv2d_delegator_ = delegator::DepthwiseDeconv2d::Create( + context->workspace(), tag, param); } else { - deconv2d_delegator_ = make_unique( - strides_, - std::vector{1, 1}, - paddings_, - padding_type_, - group_, - CAFFE); + std::string tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, General); + if (use_neon_3x3_s1) { + tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K3x3S1); + } else if (use_neon_3x3_s2) { + tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K3x3S2); + } else if (use_neon_4x4_s1) { + tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K4x4S1); + } else if (use_neon_4x4_s2) { + tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, + MACE_CPU_IMPL_TYPE, K4x4S2); + } + delegator::GroupDeconv2dParam param(strides_, kDepthwiseStrides, + paddings_, padding_type_, + CAFFE, group_); + depthwise_deconv2d_delegator_ = delegator::GroupDeconv2d::Create( + context->workspace(), tag, param); } } } - deconv2d_delegator_->Compute(context, - input, - filter, - nullptr, - output); -#else - if (deconv2d_delegator_ == nullptr) { - if (is_depthwise) { - deconv2d_delegator_ = make_unique>( - strides_, - std::vector{1, 1}, - paddings_, - padding_type_, - CAFFE); - } else { - deconv2d_delegator_ = make_unique>( - strides_, - std::vector{1, 1}, - paddings_, - padding_type_, - group_, - CAFFE); - } - } - 
deconv2d_delegator_->Compute(context, - input, - filter, - nullptr, - output); -#endif - - bias_add_delegator_.Compute(context, output, bias, output); - activation_delegator_.Compute(context, output, output); + depthwise_deconv2d_delegator_->Compute(context, input, filter, + nullptr, output); + bias_add_delegator_->Compute(context, output, bias, output); + activation_delegator_->Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } private: -#ifdef MACE_ENABLE_NEON - std::unique_ptr deconv2d_delegator_; - arm::fp32::BiasAdd bias_add_delegator_; - arm::fp32::Activation activation_delegator_; -#else - std::unique_ptr> deconv2d_delegator_; - ref::BiasAdd bias_add_delegator_; - ref::Activation activation_delegator_; -#endif // MACE_ENABLE_NEON + std::unique_ptr activation_delegator_; + std::unique_ptr bias_add_delegator_; + std::unique_ptr depthwise_deconv2d_delegator_; }; #ifdef MACE_ENABLE_OPENCL @@ -251,7 +215,7 @@ class DepthwiseDeconv2dOp : public Deconv2dOpBase { }; #endif // MACE_ENABLE_OPENCL -void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry) { +void RegisterDepthwiseDeconv2d(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp, DeviceType::CPU, float); diff --git a/mace/ops/dynamic_lstm.cc b/mace/ops/dynamic_lstm.cc index fc226c08d112edea0e13d19cc44aa76c1432ea7f..014f23c00d41fe283bc21c23d17bb5b53825fdee 100644 --- a/mace/ops/dynamic_lstm.cc +++ b/mace/ops/dynamic_lstm.cc @@ -35,14 +35,13 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/ops/common/lstm.h" +#include "mace/ops/delegator/gemv.h" #ifdef MACE_ENABLE_NEON #include -#include "mace/ops/arm/fp32/gemv.h" -#else -#include "mace/ops/ref/gemv.h" #endif // MACE_ENABLE_NEON namespace mace { @@ -73,7 +72,11 @@ class DynamicLSTMOp : public Operation { cell_cache_indexes_( Operation::GetRepeatedArgs("cell_cache_indexes")), out_cache_indexes_( - Operation::GetRepeatedArgs("out_cache_indexes")) {} + Operation::GetRepeatedArgs("out_cache_indexes")), + gemv_(delegator::Gemv::Create( + context->workspace(), + MACE_DELEGATOR_KEY(Gemv, CPU, T, MACE_CPU_IMPL_TYPE), + DelegatorParam())) {} inline void Validate() { const Tensor *input = this->Input(0); @@ -93,7 +96,7 @@ class DynamicLSTMOp : public Operation { ") and prev_out_delay(", prev_out_delay_, ") should be less than zero."); MACE_CHECK(prev_cell_delay_ % subsample_factor_ == 0 && - prev_out_delay_ % subsample_factor_ == 0, + prev_out_delay_ % subsample_factor_ == 0, "prev_cell_delay(", prev_cell_delay_, ") and prev_out_delay(", prev_out_delay_, ") should be multiples of subsample_factor(", @@ -190,8 +193,8 @@ class DynamicLSTMOp : public Operation { const index_t affine_a_out_dim = weights_a->dim(0); const index_t affine_a_depth = weights_a->dim(1); MACE_CHECK(affine_a_in_dim == affine_a_depth) - << "affine_a's input_dim:" << affine_a_in_dim - << "!=" << "affine_a's weights' depth:" << affine_a_depth << std::endl; + << "affine_a's input_dim:" << affine_a_in_dim + << "!=" << "affine_a's weights' depth:" << affine_a_depth << std::endl; const index_t lstm_input_dim = affine_a_out_dim + prev_cell_dim_; const index_t lstm_cell_dim = lstm_input_dim / 5; @@ -202,15 +205,15 @@ class DynamicLSTMOp : public Operation { lstm_cell_dim, ")."); MACE_CHECK(lstm_params->dim(0) == 3 && params_stride == lstm_cell_dim && lstm_cell_dim == prev_cell_dim_) - << " lstm params rows: " << lstm_params->dim(0) - << " params_stride: " << 
params_stride - << " != " << " cell_dim: " << lstm_cell_dim << std::endl; + << " lstm params rows: " << lstm_params->dim(0) + << " params_stride: " << params_stride + << " != " << " cell_dim: " << lstm_cell_dim << std::endl; const index_t affine_b_out_dim = weights_b->dim(0); const index_t affine_b_depth = weights_b->dim(1); const index_t affine_b_in_dim = lstm_cell_dim; MACE_CHECK(affine_b_in_dim == affine_b_depth) - << "affine_b's input_dim:" << affine_b_in_dim - << "!=" << "affine_b's weights' depth:" << affine_b_depth << std::endl; + << "affine_b's input_dim:" << affine_b_in_dim + << "!=" << "affine_b's weights' depth:" << affine_b_depth << std::endl; const index_t output_dim = affine_b_out_dim; MACE_CHECK(prev_out_offset_ + prev_out_dim_ <= output_dim) @@ -316,16 +319,16 @@ class DynamicLSTMOp : public Operation { prev_out_buf_data + i % out_buf_chunk * prev_out_dim_, prev_out_dim_ * sizeof(float)); // Affine - gemv_.Compute(context, - weights_a, - &affine_a_in, - bias_a, - 1, - affine_a_out_dim, - affine_a_depth, - false, - false, - &affine_a_out); + gemv_->Compute(context, + weights_a, + &affine_a_in, + bias_a, + 1, + affine_a_out_dim, + affine_a_depth, + false, + false, + &affine_a_out); // Prepare LSTMNonlinear input and output pointer float *lstm_cell_ptr = prev_cell_buf_data + i % cell_buf_chunk * prev_cell_dim_; @@ -343,16 +346,16 @@ class DynamicLSTMOp : public Operation { affine_b_in_data); UpdateCell(curr_cell_ptr, prev_cell_dim_, scale_); // Affine - gemv_.Compute(context, - weights_b, - &affine_b_in, - bias_b, - 1, - affine_b_out_dim, - affine_b_depth, - false, - false, - &affine_b_out); + gemv_->Compute(context, + weights_b, + &affine_b_in, + bias_b, + 1, + affine_b_out_dim, + affine_b_depth, + false, + false, + &affine_b_out); // Output memcpy(output_ptr, affine_b_out_data, @@ -404,18 +407,13 @@ class DynamicLSTMOp : public Operation { std::vector forward_indexes_; std::vector cell_cache_indexes_; std::vector out_cache_indexes_; - -#ifdef MACE_ENABLE_NEON - arm::fp32::Gemv gemv_; -#else - ref::Gemv gemv_; -#endif // MACE_ENABLE_NEON + std::unique_ptr gemv_; MACE_OP_INPUT_TAGS(INPUT, PREV_OUT, PREV_CELL, WEIGHTS_A, PARAMS, WEIGHTS_B); MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_CACHE, CELL_CACHE); }; -void RegisterDynamicLSTM(OpRegistryBase *op_registry) { +void RegisterDynamicLSTM(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "DynamicLSTM", DynamicLSTMOp, DeviceType::CPU, float); } diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index 7db7b6c0c6c59bebab78840c6316fb120908ed01..e4d5a74b9bf518e10de8d499924733e38edafff6 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -12,11 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifdef MACE_ENABLE_NEON #ifdef MACE_ENABLE_QUANTIZE -#include "mace/ops/arm/q8/eltwise.h" +#include "mace/ops/delegator/eltwise.h" #endif // MACE_ENABLE_QUANTIZE -#endif // MACE_ENABLE_NEON #include "mace/ops/eltwise.h" @@ -28,7 +26,8 @@ #include #include "mace/core/future.h" -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/core/tensor.h" #include "mace/utils/memory.h" #include "mace/core/quantize.h" @@ -1061,7 +1060,7 @@ class EltwiseOp : public Operation { }; #ifdef MACE_ENABLE_QUANTIZE -template <> +template<> class EltwiseOp : public Operation { public: explicit EltwiseOp(OpConstructContext *context) @@ -1071,12 +1070,15 @@ class EltwiseOp : public Operation { coeff_(Operation::GetRepeatedArgs("coeff")), scalar_input_(Operation::GetOptionalArg("scalar_input", 1.0)), scalar_input_index_(Operation::GetOptionalArg( - "scalar_input_index", 1)) -#ifdef MACE_ENABLE_NEON - , eltwise_(static_cast(Operation::GetOptionalArg( - "type", static_cast(ops::EltwiseType::NONE)))) -#endif - {} + "scalar_input_index", 1)), + eltwise_delegator_(delegator::Eltwise::Create( + context->workspace(), + MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, MACE_CPU_IMPL_TYPE), + delegator::EltwiseParam( + static_cast( + Operation::GetOptionalArg( + "type", + static_cast(ops::EltwiseType::NONE)))))) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -1092,77 +1094,7 @@ class EltwiseOp : public Operation { MACE_CHECK(output->scale() != 0); MACE_RETURN_IF_ERROR(output->Resize(input0->shape())); -#ifdef MACE_ENABLE_NEON - eltwise_.Compute(context, input0, input1, output); -#else - constexpr int left_shift = 20; - const double doubled_scale = 2 * std::max(input0->scale(), input1->scale()); - const double adjusted_input0_scale = input0->scale() / doubled_scale; - const double adjusted_input1_scale = input1->scale() / doubled_scale; - const double adjusted_output_scale = - doubled_scale / ((1 << left_shift) * output->scale()); - - int32_t input0_multiplier; - int32_t input1_multiplier; - int32_t output_multiplier; - int32_t input0_shift; - int32_t input1_shift; - int32_t output_shift; - QuantizeMultiplier(adjusted_input0_scale, - &input0_multiplier, - &input0_shift); - QuantizeMultiplier(adjusted_input1_scale, - &input1_multiplier, - &input1_shift); - QuantizeMultiplier(adjusted_output_scale, - &output_multiplier, - &output_shift); - - Tensor::MappingGuard input0_guard(input0); - Tensor::MappingGuard input1_guard(input1); - Tensor::MappingGuard output_guard(output); - - auto input0_ptr = input0->data(); - auto input1_ptr = input1->data(); - auto output_ptr = output->mutable_data(); - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { - for (index_t i = start; i < end; i += step) { - const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); - const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); - const int32_t shifted_input0 = offset_input0 * (1 << left_shift); - const int32_t shifted_input1 = offset_input1 * (1 << left_shift); - const int32_t multiplied_input0 = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0, - input0_multiplier), - -input0_shift); - const int32_t multiplied_input1 = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, - input1_multiplier), - -input1_shift); - - int32_t res; - if (type_ == 
SUM) { - res = multiplied_input0 + multiplied_input1; - } else { - res = multiplied_input0 - multiplied_input1; - } - - const int32_t output_val = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(res, - output_multiplier), - -output_shift) + output->zero_point(); - output_ptr[i] = Saturate(output_val); - } - }, 0, output->size(), 1); -#endif // NEON - - return MaceStatus::MACE_SUCCESS; + return eltwise_delegator_->Compute(context, input0, input1, output); } private: @@ -1171,9 +1103,7 @@ class EltwiseOp : public Operation { float scalar_input_; int32_t scalar_input_index_; Tensor scalar_tensor_; -#ifdef MACE_ENABLE_NEON - arm::q8::Eltwise eltwise_; -#endif + std::unique_ptr eltwise_delegator_; }; #endif // MACE_ENABLE_QUANTIZE @@ -1244,7 +1174,7 @@ class EltwiseOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterEltwise(OpRegistryBase *op_registry) { +void RegisterEltwise(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, DeviceType::CPU, float); diff --git a/mace/ops/expand_dims.cc b/mace/ops/expand_dims.cc index 5474dd4bc26f50836271a2073be7e5f28f1f0ffe..cc3426c3cab7e27a3cb4965d362c147acaf7a428 100644 --- a/mace/ops/expand_dims.cc +++ b/mace/ops/expand_dims.cc @@ -13,7 +13,8 @@ // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/utils/math.h" namespace mace { @@ -53,7 +54,7 @@ class ExpandDimsOp : public Operation { int axis_; }; -void RegisterExpandDims(OpRegistryBase *op_registry) { +void RegisterExpandDims(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp, DeviceType::CPU, float); diff --git a/mace/ops/extract_pooling.cc b/mace/ops/extract_pooling.cc index 87264f4f66ff04c2bd0c17959450cf9add9532de..765fc58ebc6b4fb2c92286cc9651e2c239e04649 100644 --- a/mace/ops/extract_pooling.cc +++ b/mace/ops/extract_pooling.cc @@ -26,7 +26,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { @@ -176,7 +177,7 @@ class ExtractPoolingOp : public Operation { std::vector counts_; }; -void RegisterExtractPooling(OpRegistryBase *op_registry) { +void RegisterExtractPooling(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "ExtractPooling", ExtractPoolingOp, DeviceType::CPU, float); } diff --git a/mace/ops/fill.cc b/mace/ops/fill.cc index 32a8595dcef36cf352722b38b4ef84e8a0f6ca34..0917674b18c854609617e4e6690c74542b23dc7e 100644 --- a/mace/ops/fill.cc +++ b/mace/ops/fill.cc @@ -13,7 +13,8 @@ // limitations under the License. 
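The non-NEON path removed from the quantized `EltwiseOp` above did its requantization inline: both inputs were brought to a common scale with a left shift and per-operand fixed-point multipliers, added or subtracted, then rescaled to the output's quantization parameters with gemmlowp's saturating helpers. That arithmetic now lives behind the `Eltwise` delegator. For readers who want the semantics without the fixed-point machinery, a plain floating-point reference of the same quantized addition (an illustrative stand-alone function, not MACE code):

```c++
#include <algorithm>
#include <cmath>
#include <cstdint>

// Reference semantics of quantized addition: dequantize, add, requantize.
// The removed block computed the same result with integer-only arithmetic
// (left shift, per-operand multipliers, rounding right shifts) for speed.
uint8_t QuantizedAdd(uint8_t a, float scale_a, int32_t zero_a,
                     uint8_t b, float scale_b, int32_t zero_b,
                     float scale_out, int32_t zero_out) {
  const float real = scale_a * (a - zero_a) + scale_b * (b - zero_b);
  const int32_t q =
      static_cast<int32_t>(std::lround(real / scale_out)) + zero_out;
  return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
}
```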
-#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -61,7 +62,7 @@ class FillOp : public Operation { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -void RegisterFill(OpRegistryBase *op_registry) { +void RegisterFill(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Fill", FillOp, DeviceType::CPU, float); } diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index d863a2843a493d3186021d6621f226fc89689e7b..b037488837e679b8fbf47a8363f5e17c9d4bca42 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -17,22 +17,12 @@ #include #include "mace/core/future.h" -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/core/tensor.h" #include "mace/ops/activation.h" - -#ifdef MACE_ENABLE_NEON -#include "mace/ops/arm/fp32/gemv.h" -#include "mace/ops/arm/fp32/activation.h" - -#ifdef MACE_ENABLE_QUANTIZE -#include "mace/ops/arm/q8/gemv.h" -#endif // MACE_ENABLE_QUANTIZE - -#else -#include "mace/ops/ref/gemv.h" -#include "mace/ops/ref/activation.h" -#endif // MACE_ENABLE_NEON +#include "mace/ops/delegator/activation.h" +#include "mace/ops/delegator/gemv.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" @@ -71,9 +61,16 @@ class FullyConnectedOp : public FullyConnectedOpBase { public: explicit FullyConnectedOp(OpConstructContext *context) : FullyConnectedOpBase(context), - activation_delegator_(activation_, - relux_max_limit_, - leakyrelu_coefficient_) {} + activation_delegator_(delegator::Activation::Create( + context->workspace(), + MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), + delegator::ActivationParam(activation_, + relux_max_limit_, + leakyrelu_coefficient_))), + gemv_(delegator::Gemv::Create( + context->workspace(), + MACE_DELEGATOR_KEY(Gemv, CPU, float, MACE_CPU_IMPL_TYPE), + DelegatorParam())) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -100,30 +97,25 @@ class FullyConnectedOp : public FullyConnectedOpBase { const index_t input_size = weight->dim(1) * weight->dim(2) * weight->dim(3); const index_t output_size = weight->dim(0); - gemv_.Compute(context, - weight, - input, - bias, - batch, - output_size, - input_size, - false, - true, - output); + gemv_->Compute(context, + weight, + input, + bias, + batch, + output_size, + input_size, + false, + true, + output); - activation_delegator_.Compute(context, output, output); + activation_delegator_->Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } private: -#ifdef MACE_ENABLE_NEON - arm::fp32::Gemv gemv_; - arm::fp32::Activation activation_delegator_; -#else - ref::Gemv gemv_; - ref::Activation activation_delegator_; -#endif // MACE_ENABLE_NEON + std::unique_ptr activation_delegator_; + std::unique_ptr gemv_; }; #ifdef MACE_ENABLE_QUANTIZE @@ -132,7 +124,11 @@ class FullyConnectedOp : public FullyConnectedOpBase { public: explicit FullyConnectedOp(OpConstructContext *context) - : FullyConnectedOpBase(context) {} + : FullyConnectedOpBase(context), + gemv_(delegator::Gemv::Create( + context->workspace(), + MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, MACE_CPU_IMPL_TYPE), + DelegatorParam())) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); @@ -161,7 +157,7 @@ class FullyConnectedOp const int input_size = static_cast(weight->dim(1) * weight->dim(2) * weight->dim(3)); const int output_size = static_cast(weight->dim(0)); - 
gemv_.Compute(context, + gemv_->Compute(context, weight, input, bias, @@ -175,11 +171,7 @@ class FullyConnectedOp } private: -#ifdef MACE_ENABLE_NEON - ::mace::ops::arm::q8::Gemv gemv_; -#else - ref::Gemv gemv_; -#endif // MACE_ENABLE_NEON + std::unique_ptr gemv_; }; #endif // MACE_ENABLE_QUANTIZE @@ -231,7 +223,7 @@ class FullyConnectedOp : public FullyConnectedOpBase { }; #endif // MACE_ENABLE_OPENCL -void RegisterFullyConnected(OpRegistryBase *op_registry) { +void RegisterFullyConnected(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "FullyConnected", FullyConnectedOp, DeviceType::CPU, float); diff --git a/mace/ops/gather.cc b/mace/ops/gather.cc index 2114290b66ff8d2d256bc7e9dcce02b298331112..a112d91f94a24b9e8be455e727e8cf87f8c46e6c 100644 --- a/mace/ops/gather.cc +++ b/mace/ops/gather.cc @@ -14,7 +14,8 @@ #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -85,7 +86,7 @@ class GatherOp : public Operation { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -void RegisterGather(OpRegistryBase *op_registry) { +void RegisterGather(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Gather", GatherOp, DeviceType::CPU, float); diff --git a/mace/ops/identity.cc b/mace/ops/identity.cc index 1c7a037ee2b8c1ec445b8c638958209cde7792f0..ac915cd848558300b8cd59770f663e0a2e856727 100644 --- a/mace/ops/identity.cc +++ b/mace/ops/identity.cc @@ -13,7 +13,8 @@ // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -32,7 +33,7 @@ class IdentityOp : public Operation { } }; -void RegisterIdentity(OpRegistryBase *op_registry) { +void RegisterIdentity(OpRegistry *op_registry) { MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp, DeviceType::CPU, float); MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp, diff --git a/mace/ops/ifdefined.cc b/mace/ops/ifdefined.cc index f0367d20f08d76250bb426da24d5882e6229ab48..84a2831609bec4a4c5ef455834f29812f30848ec 100644 --- a/mace/ops/ifdefined.cc +++ b/mace/ops/ifdefined.cc @@ -25,7 +25,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -162,7 +163,7 @@ class IfDefinedOp : public Operation { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -void RegisterIfDefined(OpRegistryBase *op_registry) { +void RegisterIfDefined(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "IfDefined", IfDefinedOp, DeviceType::CPU, float); } diff --git a/mace/ops/infer_conv2d_shape.cc b/mace/ops/infer_conv2d_shape.cc index fb7bfecc90ccb80d2cedaf321d65b207be988892..f29056fec26989b363e532440da982c55866e1eb 100644 --- a/mace/ops/infer_conv2d_shape.cc +++ b/mace/ops/infer_conv2d_shape.cc @@ -13,7 +13,8 @@ // limitations under the License. 
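The `FullyConnected` hunk above shows the other half of the pattern: instead of `#ifdef`-selected value members (`arm::fp32::Gemv` versus `ref::Gemv`), the op holds `std::unique_ptr`s to the abstract delegators and fills them in the constructor's initializer list, where `OpConstructContext` already exposes the workspace. In outline (the CPU/float template specialization syntax and `Run()` are elided):

```c++
class FullyConnectedOp : public FullyConnectedOpBase {  // CPU/float variant
 public:
  explicit FullyConnectedOp(OpConstructContext *context)
      : FullyConnectedOpBase(context),
        activation_delegator_(delegator::Activation::Create(
            context->workspace(),
            MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE),
            delegator::ActivationParam(activation_, relux_max_limit_,
                                       leakyrelu_coefficient_))),
        gemv_(delegator::Gemv::Create(
            context->workspace(),
            MACE_DELEGATOR_KEY(Gemv, CPU, float, MACE_CPU_IMPL_TYPE),
            DelegatorParam())) {}

  // Run() composes them: gemv_->Compute(...), then
  // activation_delegator_->Compute(context, output, output).

 private:
  std::unique_ptr<delegator::Activation> activation_delegator_;
  std::unique_ptr<delegator::Gemv> gemv_;
};
```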
-#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/ops/common/conv_pool_2d_util.h" namespace mace { @@ -101,7 +102,7 @@ class InferConv2dShapeOp : public Operation { } }; -void RegisterInferConv2dShape(OpRegistryBase *op_registry) { +void RegisterInferConv2dShape(OpRegistry *op_registry) { MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape", InferConv2dShapeOp, DeviceType::CPU, float); MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape", diff --git a/mace/ops/kaldi_batch_norm.cc b/mace/ops/kaldi_batch_norm.cc index 61c0340cc9abf41bbf224f60402b59d4241eadb7..ed05064faabe77db2feeef4f1fcb24a35fb5970c 100644 --- a/mace/ops/kaldi_batch_norm.cc +++ b/mace/ops/kaldi_batch_norm.cc @@ -19,7 +19,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -167,7 +168,7 @@ class KaldiBatchNormOp : public Operation { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -void RegisterKaldiBatchNorm(OpRegistryBase *op_registry) { +void RegisterKaldiBatchNorm(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "KaldiBatchNorm", KaldiBatchNormOp, DeviceType::CPU, float); } diff --git a/mace/ops/local_response_norm.cc b/mace/ops/local_response_norm.cc index 022ee3e7aa979ee36794f0fe6c4888012a0f0cb2..2ade126c8e7deba122dddfe4eff19d6b4bbc50bf 100644 --- a/mace/ops/local_response_norm.cc +++ b/mace/ops/local_response_norm.cc @@ -15,7 +15,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -91,7 +92,7 @@ class LocalResponseNormOp : public Operation { float beta_; }; -void RegisterLocalResponseNorm(OpRegistryBase *op_registry) { +void RegisterLocalResponseNorm(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "LocalResponseNorm", LocalResponseNormOp, DeviceType::CPU, float); } diff --git a/mace/ops/lpnorm.cc b/mace/ops/lpnorm.cc index 2c62ac194f688788502b1f8be19505ad87ab4402..a5c68a3575931911478461ffd802b16f5e8b79fb 100644 --- a/mace/ops/lpnorm.cc +++ b/mace/ops/lpnorm.cc @@ -16,7 +16,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/lpnorm.h" @@ -147,7 +148,7 @@ class LpNormOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterLpNorm(OpRegistryBase *op_registry) { +void RegisterLpNorm(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "LpNorm", LpNormOp, DeviceType::CPU, float); MACE_REGISTER_GPU_OP(op_registry, "LpNorm", LpNormOp); diff --git a/mace/ops/lstm_nonlinear.cc b/mace/ops/lstm_nonlinear.cc index fbf92c16e4361623d41dfbb50e704a4d8a81021e..c975ae62da40b549105bf936653e8ebaa07694c3 100644 --- a/mace/ops/lstm_nonlinear.cc +++ b/mace/ops/lstm_nonlinear.cc @@ -18,7 +18,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/ops/common/lstm.h" namespace mace { @@ -100,7 +101,7 @@ class LSTMNonlinearOp : public Operation { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -void RegisterLSTMNonlinear(OpRegistryBase *op_registry) { +void RegisterLSTMNonlinear(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "LSTMNonlinear", LSTMNonlinearOp, DeviceType::CPU, float); } diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index 
1c97279e90f3ccd5792c1ea866729ef0842b9bb4..75e278708514aa94c1783bde7bd9bd228d46a242 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -19,25 +19,18 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/core/tensor.h" +#include "mace/ops/delegator/gemm.h" +#include "mace/ops/delegator/gemv.h" #include "mace/utils/math.h" -#ifdef MACE_ENABLE_NEON -#include "mace/ops/arm/fp32/gemm.h" -#include "mace/ops/arm/fp32/gemv.h" - #ifdef MACE_ENABLE_QUANTIZE +#include "mace/ops/common/gemmlowp_util.h" +#ifdef MACE_ENABLE_NEON #include "mace/ops/arm/q8/gemv.h" -#endif // MACE_ENABLE_QUANTIZE - -#else -#include "mace/ops/ref/gemm.h" -#include "mace/ops/ref/gemv.h" #endif // MACE_ENABLE_NEON - -#ifdef MACE_ENABLE_QUANTIZE -#include "mace/ops/common/gemmlowp_util.h" #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL @@ -103,7 +96,15 @@ template<> class MatMulOp : public MatMulOpBase { public: explicit MatMulOp(OpConstructContext *context) - : MatMulOpBase(context) {} + : MatMulOpBase(context), + gemm_(delegator::Gemm::Create( + context->workspace(), + MACE_DELEGATOR_KEY(Gemm, CPU, float, MACE_CPU_IMPL_TYPE), + delegator::GemmParam())), + gemv_(delegator::Gemv::Create( + context->workspace(), + MACE_DELEGATOR_KEY(Gemv, CPU, float, MACE_CPU_IMPL_TYPE), + DelegatorParam())) {} MaceStatus Run(OpContext *context) override { Validate(); @@ -154,43 +155,43 @@ class MatMulOp : public MatMulOpBase { MACE_RETURN_IF_ERROR(C->Resize(output_shape)); if (rows == 1 && transpose_b_) { - return gemv_.Compute(context, - rhs, - lhs, - bias, - batch, - cols, - depth, - rhs_batched, - lhs_batched, - C); + return gemv_->Compute(context, + rhs, + lhs, + bias, + batch, + cols, + depth, + rhs_batched, + lhs_batched, + C); } else if (cols == 1 && !transpose_a_) { - return gemv_.Compute(context, - lhs, - rhs, - bias, - batch, - rows, - depth, - lhs_batched, - rhs_batched, - C); + return gemv_->Compute(context, + lhs, + rhs, + bias, + batch, + rows, + depth, + lhs_batched, + rhs_batched, + C); } else { context->device()->scratch_buffer()->Rewind(); - MaceStatus ret = gemm_.Compute(context, - lhs, - rhs, - batch, - lhs_rows, - lhs_cols, - rhs_rows, - rhs_cols, - transpose_a_, - transpose_b_, - false, - lhs_batched, - rhs_batched, - C); + MaceStatus ret = gemm_->Compute(context, + lhs, + rhs, + batch, + lhs_rows, + lhs_cols, + rhs_rows, + rhs_cols, + transpose_a_, + transpose_b_, + false, + lhs_batched, + rhs_batched, + C); if (bias != nullptr) { MACE_CHECK(bias->dim_size() == 1 && bias->dim(0) == cols, "bias' dim should be <= 2."); @@ -217,13 +218,8 @@ class MatMulOp : public MatMulOpBase { } private: -#ifdef MACE_ENABLE_NEON - arm::fp32::Gemm gemm_; - arm::fp32::Gemv gemv_; -#else - ref::Gemv gemv_; - ref::Gemm gemm_; -#endif // MACE_ENABLE_NEON + std::unique_ptr gemm_; + std::unique_ptr gemv_; }; #ifdef MACE_ENABLE_QUANTIZE @@ -234,6 +230,10 @@ class MatMulFixpointImpl; template class MatMulFixpointImpl { public: +#ifdef MACE_ENABLE_NEON + MatMulFixpointImpl() + : gemv_kernel_(DelegatorParam()) {} +#endif // MACE_ENABLE_NEON void operator()(OpContext *context, const Tensor *A, const Tensor *B, @@ -318,6 +318,10 @@ class MatMulFixpointImpl { template class MatMulFixpointImpl { public: +#ifdef MACE_ENABLE_NEON + MatMulFixpointImpl() + : gemv_kernel_(DelegatorParam()) {} +#endif // MACE_ENABLE_NEON void operator()(OpContext *context, const Tensor *A, const Tensor *B, @@ -592,7 +596,7 @@ class MatMulOp : public MatMulOpBase 
{ }; #endif // MACE_ENABLE_FP16_NEON -void RegisterMatMul(OpRegistryBase *op_registry) { +void RegisterMatMul(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, DeviceType::CPU, float); diff --git a/mace/ops/mvnorm.cc b/mace/ops/mvnorm.cc index ccb0018a5881d1341de931bead67a41a367af985..09d3bb9a3cf0239c68ad857e698c16d5d89439e7 100644 --- a/mace/ops/mvnorm.cc +++ b/mace/ops/mvnorm.cc @@ -16,7 +16,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/mvnorm.h" @@ -165,7 +166,7 @@ class MVNormOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterMVNorm(OpRegistryBase *op_registry) { +void RegisterMVNorm(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "MVNorm", MVNormOp, DeviceType::CPU, float); MACE_REGISTER_GPU_OP(op_registry, "MVNorm", MVNormOp); diff --git a/mace/ops/one_hot.cc b/mace/ops/one_hot.cc index 1596286af6ae4af96e5e7d01194fa5eff7e235a2..77d18bca3b7635b794c29d0b5a21ae7219876fad 100644 --- a/mace/ops/one_hot.cc +++ b/mace/ops/one_hot.cc @@ -15,7 +15,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -144,7 +145,7 @@ class OneHotOp : public OneHotOpBase { }; -void RegisterOneHot(OpRegistryBase *op_registry) { +void RegisterOneHot(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "OneHot", OneHotOp, DeviceType::CPU, float); } diff --git a/mace/ops/opencl/buffer/buffer_transform.h b/mace/ops/opencl/buffer/buffer_transform.h index 25415877e676707aab857fd09e81d4821ae99361..5b47bdc7403a222f3806e6309d12f868dacc3de4 100644 --- a/mace/ops/opencl/buffer/buffer_transform.h +++ b/mace/ops/opencl/buffer/buffer_transform.h @@ -19,7 +19,7 @@ #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/buffer/buffer_type_transform.cc b/mace/ops/opencl/buffer/buffer_type_transform.cc index 688ded664fa7dac533fd7fbafcfc7d1d8fbf9cdc..e86c460874552c3a8e8d56ee3eea13a1f3f73d1d 100644 --- a/mace/ops/opencl/buffer/buffer_type_transform.cc +++ b/mace/ops/opencl/buffer/buffer_type_transform.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/buffer/conv_2d_1x1.cc b/mace/ops/opencl/buffer/conv_2d_1x1.cc index 95c85b17dd24438a8c9bd45c974b7c23c46c85be..001c201d29281f66dbb8bc46c27b3a779114387b 100644 --- a/mace/ops/opencl/buffer/conv_2d_1x1.cc +++ b/mace/ops/opencl/buffer/conv_2d_1x1.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
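The float `MatMulOp` above keeps both a `Gemm` and a `Gemv` delegator and chooses between them by output shape, so matrix-vector products never take the general GEMM path. The decision logic, stripped of the surrounding shape plumbing (bias handling in the GEMM branch is elided; in the real op the bias, when present, is added after the call):

```c++
// Inside MatMulOp<CPU, float>::Run(), after shapes have been validated:
// rows/cols describe the output C, depth is the shared inner dimension.
if (rows == 1 && transpose_b_) {
  // Output is a single row: run it as a matrix-vector product on rhs.
  return gemv_->Compute(context, rhs, lhs, bias, batch, cols, depth,
                        rhs_batched, lhs_batched, C);
} else if (cols == 1 && !transpose_a_) {
  // Output is a single column: run it as a matrix-vector product on lhs.
  return gemv_->Compute(context, lhs, rhs, bias, batch, rows, depth,
                        lhs_batched, rhs_batched, C);
} else {
  // General case: the Gemm delegator handles transposes and batching.
  return gemm_->Compute(context, lhs, rhs, batch, lhs_rows, lhs_cols,
                        rhs_rows, rhs_cols, transpose_a_, transpose_b_,
                        /*transpose_out=*/false, lhs_batched, rhs_batched, C);
}
```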
-#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/ops/common/activation_type.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/buffer/conv_2d_general.cc b/mace/ops/opencl/buffer/conv_2d_general.cc index 4c03ee2af0c5b5452878db16067fff114088884c..9e7d75089b03d6d45a4f293b80105e3c5ac6a2d3 100644 --- a/mace/ops/opencl/buffer/conv_2d_general.cc +++ b/mace/ops/opencl/buffer/conv_2d_general.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/ops/common/activation_type.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/buffer/reshape.cc b/mace/ops/opencl/buffer/reshape.cc index ae3c119c2368d4c57d2151a641472d508999151b..73f78777c948ddd4a8f536be2d54e03ab19e9679 100644 --- a/mace/ops/opencl/buffer/reshape.cc +++ b/mace/ops/opencl/buffer/reshape.cc @@ -16,7 +16,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/buffer/softmax.h b/mace/ops/opencl/buffer/softmax.h index 0acae465953c75fc6d053b8d6c90040a17f75818..5555ad61787a0145f282b75decf08813b08ffdb0 100644 --- a/mace/ops/opencl/buffer/softmax.h +++ b/mace/ops/opencl/buffer/softmax.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/buffer/utils.h b/mace/ops/opencl/buffer/utils.h index e68fcb4a274e28900a440454fb34cfacfa1e2941..10d0dea1314be85d45329d9b6bbf0f63b27ceb5b 100644 --- a/mace/ops/opencl/buffer/utils.h +++ b/mace/ops/opencl/buffer/utils.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_BUFFER_UTILS_H_ #include "mace/core/future.h" -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/public/mace.h" diff --git a/mace/ops/opencl/buffer_transform.cc b/mace/ops/opencl/buffer_transform.cc index fc1d9dcc2c514d289baa3f56bced871723e778fc..1cacaccb0abc9694c7432f915e03b4954f3a350b 100644 --- a/mace/ops/opencl/buffer_transform.cc +++ b/mace/ops/opencl/buffer_transform.cc @@ -14,7 +14,8 @@ #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/ops/opencl/buffer_transformer.h" namespace mace { @@ -51,7 +52,7 @@ class BufferTransformOp : public Operation { MemoryType out_mem_type_; }; -void RegisterBufferTransform(OpRegistryBase *op_registry) { +void RegisterBufferTransform(OpRegistry *op_registry) { MACE_REGISTER_GPU_OP(op_registry, "BufferTransform", BufferTransformOp); } diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h index 0dcec529674a12ea54c56342c4730aed0b244c99..987507de3b2e260dfe755807607f92f780eaabc7 100644 --- a/mace/ops/opencl/buffer_transformer.h +++ b/mace/ops/opencl/buffer_transformer.h @@ -19,7 +19,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/ops/opencl/image/buffer_to_image.h" #include "mace/ops/opencl/image/image_to_buffer.h" #include "mace/ops/opencl/buffer/buffer_transform.h" diff --git a/mace/ops/opencl/image/activation.h 
b/mace/ops/opencl/image/activation.h index 929d267ddd2860161c45eb63b3be465e870298ed..bfbdc47c8cecb0dd10aa12dbcc17b6231fc9178a 100644 --- a/mace/ops/opencl/image/activation.h +++ b/mace/ops/opencl/image/activation.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/common/activation_type.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/addn.h b/mace/ops/opencl/image/addn.h index 575dee22764af5e856ec19792f5fce60634f906b..ee2c526b486e0dc67bc584a0d1f732f6ee0aec30 100644 --- a/mace/ops/opencl/image/addn.h +++ b/mace/ops/opencl/image/addn.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/batch_norm.h b/mace/ops/opencl/image/batch_norm.h index 6b7773682ff546753b75f2f94f0fb2282a0b39fc..9a93b534188cd658322ce0fcda42a1d97419f611 100644 --- a/mace/ops/opencl/image/batch_norm.h +++ b/mace/ops/opencl/image/batch_norm.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/common/activation_type.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/batch_to_space.h b/mace/ops/opencl/image/batch_to_space.h index a9d047aa2a7af096b535f2086afe9450beed46c3..ade029b6e2fb756ad9b0842ce29321785d6a751b 100644 --- a/mace/ops/opencl/image/batch_to_space.h +++ b/mace/ops/opencl/image/batch_to_space.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/bias_add.h b/mace/ops/opencl/image/bias_add.h index 67644d6a1f58b99fc0c3d4d2d4021c1e2e178adb..3430c81fd31bd0b92f7454ff8d24f2544aa9a6d4 100644 --- a/mace/ops/opencl/image/bias_add.h +++ b/mace/ops/opencl/image/bias_add.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/buffer_to_image.h b/mace/ops/opencl/image/buffer_to_image.h index 3389118279f3cdf7c8050e1bb5fd17c9e154530d..5a332f6a77d6220e7584b9658ea7416d7dd5b05e 100644 --- a/mace/ops/opencl/image/buffer_to_image.h +++ b/mace/ops/opencl/image/buffer_to_image.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/channel_shuffle.h b/mace/ops/opencl/image/channel_shuffle.h index 94448d805ccb86887bb1b9e12bce0cfba66db4a4..016b60e00bce11fa37c99b35c4ae8604f004013a 100644 --- a/mace/ops/opencl/image/channel_shuffle.h +++ b/mace/ops/opencl/image/channel_shuffle.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/concat.h b/mace/ops/opencl/image/concat.h index e5cd297779f7adb583653e31d25aa5816a377d4f..de9ee72fca7e1f5b8907f8f03dc0fdf64261c648 100644 --- a/mace/ops/opencl/image/concat.h +++ b/mace/ops/opencl/image/concat.h @@ -19,7 +19,7 @@ #include #include -#include 
"mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/conv_2d.h b/mace/ops/opencl/image/conv_2d.h index 6044c1a7235535cc0f67dcdc716b25189ed7a3d4..1ecd913137891542c11117ee54f437877e655971 100644 --- a/mace/ops/opencl/image/conv_2d.h +++ b/mace/ops/opencl/image/conv_2d.h @@ -19,7 +19,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/conv_2d_1x1.cc b/mace/ops/opencl/image/conv_2d_1x1.cc index 494672a4447cf0ed9e8611e11a241f9cc1387816..2d4baa5bbcd5123a2542bb9db1cb1a871f7a6e9c 100644 --- a/mace/ops/opencl/image/conv_2d_1x1.cc +++ b/mace/ops/opencl/image/conv_2d_1x1.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/ops/common/activation_type.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/conv_2d_3x3.cc b/mace/ops/opencl/image/conv_2d_3x3.cc index 8bfc988c8ebf4057b9a2942f632594d14cfcf7d0..c5ea2890751f9cf9ca5a7455d6abd35fe323f98e 100644 --- a/mace/ops/opencl/image/conv_2d_3x3.cc +++ b/mace/ops/opencl/image/conv_2d_3x3.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/ops/common/activation_type.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/conv_2d_general.cc b/mace/ops/opencl/image/conv_2d_general.cc index 9964c5f25cba4b39e401ac39764bd6d29b6f62f1..b84d83949d26235da4c51a135f4965ca6f8cfe3a 100644 --- a/mace/ops/opencl/image/conv_2d_general.cc +++ b/mace/ops/opencl/image/conv_2d_general.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_helper.h" #include "mace/ops/common/activation_type.h" diff --git a/mace/ops/opencl/image/crop.h b/mace/ops/opencl/image/crop.h index 33a5d2603e56e297b0c0271ad806009b38550a07..c6b9ca8a4425e9aa29a4d12388a4b9e91eebf0dd 100644 --- a/mace/ops/opencl/image/crop.h +++ b/mace/ops/opencl/image/crop.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/deconv_2d.h b/mace/ops/opencl/image/deconv_2d.h index 4f1db7e66fa4580690bd648c259543dce292083d..2ab385046f49ed629fa0b90d15b8d1b9416f5e59 100644 --- a/mace/ops/opencl/image/deconv_2d.h +++ b/mace/ops/opencl/image/deconv_2d.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/depth_to_space.h b/mace/ops/opencl/image/depth_to_space.h index 383a4c6f3fed98c2b4cec5b36121004a78a0109f..ee56b6eae3152e3c13c76d7035a95193176608db 100644 --- a/mace/ops/opencl/image/depth_to_space.h +++ b/mace/ops/opencl/image/depth_to_space.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/depthwise_conv2d.h b/mace/ops/opencl/image/depthwise_conv2d.h index c72170acdb1c15ebf27dbd327d64b5b73d40de2e..fc8833ddf6e842a6a6f4529822d7270457e76768 100644 --- a/mace/ops/opencl/image/depthwise_conv2d.h +++ b/mace/ops/opencl/image/depthwise_conv2d.h @@ -19,7 +19,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/depthwise_deconv2d.h b/mace/ops/opencl/image/depthwise_deconv2d.h index fe039cb679c449f0d432b86531d17795cb3e83e6..4643a9c1f46bc50b7d3cafa3e93649854113617f 100644 --- a/mace/ops/opencl/image/depthwise_deconv2d.h +++ b/mace/ops/opencl/image/depthwise_deconv2d.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/eltwise.h b/mace/ops/opencl/image/eltwise.h index a9298cc6582e4e5f8d805c1a0d00f9f65e99de0b..38c3dfe61428e868a490895848be58c6c0e35543 100644 --- a/mace/ops/opencl/image/eltwise.h +++ b/mace/ops/opencl/image/eltwise.h @@ -22,7 +22,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/common/eltwise_type.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/fully_connected.h b/mace/ops/opencl/image/fully_connected.h index 010edcac9979c659e6d926e076d941d9fea426dd..46a93a6173a90e926a316a8f299df6b5e7f118ee 100644 --- a/mace/ops/opencl/image/fully_connected.h +++ b/mace/ops/opencl/image/fully_connected.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/common/activation_type.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff 
--git a/mace/ops/opencl/image/image_to_buffer.h b/mace/ops/opencl/image/image_to_buffer.h index 5d5c524884c0ccb6ce976ee8cd45d345c445e20d..f6484e2f48bc3f16ff34aec359352d0076d54458 100644 --- a/mace/ops/opencl/image/image_to_buffer.h +++ b/mace/ops/opencl/image/image_to_buffer.h @@ -19,7 +19,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/ops/opencl/buffer_transform_kernel.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/lpnorm.h b/mace/ops/opencl/image/lpnorm.h index cac641125d99d4e93495d67b45e00f0f27bb3c7c..d500c66d14228d68e27f6cb415eb7dfa068af3b5 100644 --- a/mace/ops/opencl/image/lpnorm.h +++ b/mace/ops/opencl/image/lpnorm.h @@ -14,7 +14,7 @@ #ifndef MACE_OPS_OPENCL_IMAGE_LPNORM_H_ #define MACE_OPS_OPENCL_IMAGE_LPNORM_H_ -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/lstm_cell.h b/mace/ops/opencl/image/lstm_cell.h index 998d8147675c2dd1b3ade1b782055a86117aea83..f224ba07d95f469bfb1b7fe718e059a8eabcd498 100644 --- a/mace/ops/opencl/image/lstm_cell.h +++ b/mace/ops/opencl/image/lstm_cell.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/matmul.h b/mace/ops/opencl/image/matmul.h index 8ee05239b798d8c8b6f660fa4aea335ded3549b7..f9e3125d01449e476cb39a5c9055fe01754ba87b 100644 --- a/mace/ops/opencl/image/matmul.h +++ b/mace/ops/opencl/image/matmul.h @@ -22,7 +22,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/mvnorm.h b/mace/ops/opencl/image/mvnorm.h index f6e609d27240612a0c53141ce409790b6b826234..5752167e2157808b96e4da080b4bccbfa6d1934b 100644 --- a/mace/ops/opencl/image/mvnorm.h +++ b/mace/ops/opencl/image/mvnorm.h @@ -17,7 +17,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/pad.h b/mace/ops/opencl/image/pad.h index 3df88f34e31020a848ee34d9c958cf8bc0200b32..6c04c7c8a9c9b25dea9345450732784915959061 100644 --- a/mace/ops/opencl/image/pad.h +++ b/mace/ops/opencl/image/pad.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/common/pad_type.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/pooling.h b/mace/ops/opencl/image/pooling.h index 5c0e14a52b544e65af82bfd05bcc2a939e9d2a1b..8f0e0c062f4be9b70febf6491a98ff68e057051d 100644 --- a/mace/ops/opencl/image/pooling.h +++ b/mace/ops/opencl/image/pooling.h @@ -22,7 +22,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/reduce.h b/mace/ops/opencl/image/reduce.h index 0dfb48b427a25df89e475e45873d0ec69197f95a..24e889d73384ee98e7db35a4a8045e297805a452 100644 --- a/mace/ops/opencl/image/reduce.h +++ 
b/mace/ops/opencl/image/reduce.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" #include "mace/ops/common/reduce_type.h" diff --git a/mace/ops/opencl/image/reshape.h b/mace/ops/opencl/image/reshape.h index 60be5fe0272c8827ce95003613ba0e07ab025396..3ee6bf297ba88aa12d5482d1262af1da1dd6366e 100644 --- a/mace/ops/opencl/image/reshape.h +++ b/mace/ops/opencl/image/reshape.h @@ -20,7 +20,7 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" #include "mace/core/runtime/opencl/opencl_helper.h" #include "mace/ops/opencl/buffer_transform_kernel.h" diff --git a/mace/ops/opencl/image/resize_bicubic.h b/mace/ops/opencl/image/resize_bicubic.h index 5abc553974e0c3fb1a4c2056ec140baf70e736cd..aab813691e8ab4dbc4cc5c84c361135a8758936a 100644 --- a/mace/ops/opencl/image/resize_bicubic.h +++ b/mace/ops/opencl/image/resize_bicubic.h @@ -22,7 +22,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/resize_bilinear.h b/mace/ops/opencl/image/resize_bilinear.h index ca3602d33942da03de3aa3f3cb093513af74a324..a428a81395a6d9fdf130ad4ff48184052c6c3d6e 100644 --- a/mace/ops/opencl/image/resize_bilinear.h +++ b/mace/ops/opencl/image/resize_bilinear.h @@ -22,7 +22,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/resize_nearest_neighbor.h b/mace/ops/opencl/image/resize_nearest_neighbor.h index 8bb10d4b2fd56046a689beae3e9abb3f0671f05e..1092665ee2ef32a8b1443259d4fcb8215b70f443 100644 --- a/mace/ops/opencl/image/resize_nearest_neighbor.h +++ b/mace/ops/opencl/image/resize_nearest_neighbor.h @@ -22,7 +22,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/softmax.h b/mace/ops/opencl/image/softmax.h index 525f1edc51ee8bc7637a2c9c83ffa876d67ab4b2..1873cd164f13630c593dc14245f3a04befd470ee 100644 --- a/mace/ops/opencl/image/softmax.h +++ b/mace/ops/opencl/image/softmax.h @@ -22,7 +22,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/space_to_batch.h b/mace/ops/opencl/image/space_to_batch.h index 20777dc88453bc1746aab4e50c2c20f98babecec..f1001b2f7a7cfaf2de8cb9ad3fd4b131a3f46033 100644 --- a/mace/ops/opencl/image/space_to_batch.h +++ b/mace/ops/opencl/image/space_to_batch.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/space_to_depth.h b/mace/ops/opencl/image/space_to_depth.h index 661e09af222ebf8ae07082d4192878d8e4703f36..6abb330f8e199cabeadd97131879e1ccf09113ef 100644 --- a/mace/ops/opencl/image/space_to_depth.h +++ b/mace/ops/opencl/image/space_to_depth.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include 
"mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/split.h b/mace/ops/opencl/image/split.h index 20e1936207dca72126efba0a1b80a3bafa149012..0d2eaff260e280595017cdbe9752508452e314d6 100644 --- a/mace/ops/opencl/image/split.h +++ b/mace/ops/opencl/image/split.h @@ -22,7 +22,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/sqrdiff_mean.h b/mace/ops/opencl/image/sqrdiff_mean.h index 5acddb2556946b42ad1062ce6ec8c7bcf255e2cf..740fc03658c3594445bee5d2320a127faca86d63 100644 --- a/mace/ops/opencl/image/sqrdiff_mean.h +++ b/mace/ops/opencl/image/sqrdiff_mean.h @@ -21,7 +21,7 @@ #include #include -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/image/winograd_conv2d.cc b/mace/ops/opencl/image/winograd_conv2d.cc index fd7cdfe67f1f37b4f1701d77d28f0759829594dc..539b4cf4f8604261dbc79d8536e84bcc3f9596d0 100644 --- a/mace/ops/opencl/image/winograd_conv2d.cc +++ b/mace/ops/opencl/image/winograd_conv2d.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/ops/common/activation_type.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/core/runtime/opencl/opencl_helper.h" diff --git a/mace/ops/opencl/lstm_cell.cc b/mace/ops/opencl/lstm_cell.cc index ce45c84401f89d42762c8a2c2bccbb57c35c08e1..dbdc2650d875a4a0caa60aaab9495cece1b9c26b 100644 --- a/mace/ops/opencl/lstm_cell.cc +++ b/mace/ops/opencl/lstm_cell.cc @@ -17,7 +17,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/lstm_cell.h" #include "mace/utils/memory.h" @@ -89,7 +90,7 @@ class LSTMCellOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterLSTMCell(OpRegistryBase *op_registry) { +void RegisterLSTMCell(OpRegistry *op_registry) { MACE_REGISTER_GPU_OP(op_registry, "LSTMCell", LSTMCellOp); } diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index 49784c10db2c999b07faffe927aa6d6ebb061746..b210f40e87f3acc1712b92acf5ed4d6a7a161e5f 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -15,7 +15,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/ops/common/pad_type.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/pad.h" @@ -198,7 +199,7 @@ class PadOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterPad(OpRegistryBase *op_registry) { +void RegisterPad(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Pad", PadOp, DeviceType::CPU, float); diff --git a/mace/ops/pad_context.cc b/mace/ops/pad_context.cc index 25117df2562cc8a6d45ef70929a919ce8f9da0de..02a8c4250922a59b4d72e273b6e3ad6a82913e76 100644 --- a/mace/ops/pad_context.cc +++ b/mace/ops/pad_context.cc @@ -18,7 +18,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/utils/math.h" namespace mace { @@ -83,7 +84,7 @@ class PadContextOp : public Operation { int right_context_; }; -void RegisterPadContext(OpRegistryBase 
*op_registry) { +void RegisterPadContext(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "PadContext", PadContextOp, DeviceType::CPU, float); } diff --git a/mace/ops/pnorm.cc b/mace/ops/pnorm.cc index 1d0d6698604834fdd58fb390171d21d0976780ec..588e59745404b9252bda70e6e2ac0ef192a839f5 100644 --- a/mace/ops/pnorm.cc +++ b/mace/ops/pnorm.cc @@ -26,7 +26,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -132,7 +133,7 @@ class PNormOp : public Operation { int output_dim_; }; -void RegisterPNorm(OpRegistryBase *op_registry) { +void RegisterPNorm(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "PNorm", PNormOp, DeviceType::CPU, float); } diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 4d4247f2b7236a0a3270c7d30a413c2885ca8256..2d51c1c4c64eb1a2274c2c6fd44d1965a66242c5 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -22,7 +22,8 @@ #include #include "mace/core/future.h" -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/core/tensor.h" #include "mace/ops/conv_pool_2d_base.h" #include "mace/ops/common/conv_pool_2d_util.h" @@ -510,7 +511,7 @@ class PoolingOp : public PoolingOpBase { }; #endif // MACE_ENABLE_OPENCL -void RegisterPooling(OpRegistryBase *op_registry) { +void RegisterPooling(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, DeviceType::CPU, float); diff --git a/mace/ops/prior_box.cc b/mace/ops/prior_box.cc index 62040d272d4eb7ba46ba8b6d3bc20db401f9c644..3598c98a8b98d882d82f89c9b1fc8063b3258a56 100644 --- a/mace/ops/prior_box.cc +++ b/mace/ops/prior_box.cc @@ -18,7 +18,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -144,7 +145,7 @@ class PriorBoxOp : public Operation { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -void RegisterPriorBox(OpRegistryBase *op_registry) { +void RegisterPriorBox(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "PriorBox", PriorBoxOp, DeviceType::CPU, float); } diff --git a/mace/ops/reduce.cc b/mace/ops/reduce.cc index 7c34db3e6a06fe89f84a5d980afc9da67585d584..a81a602d9be90b2eece8f2ca96f93609b1317b78 100644 --- a/mace/ops/reduce.cc +++ b/mace/ops/reduce.cc @@ -19,7 +19,8 @@ #include "mace/ops/common/reduce_type.h" #include "mace/core/future.h" -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/core/runtime/cpu/cpu_runtime.h" #include "mace/core/tensor.h" #ifdef MACE_ENABLE_OPENCL @@ -1032,7 +1033,7 @@ class ReduceOp : public ReduceOpBase { }; #endif // MACE_ENABLE_OPENCL -void RegisterReduce(OpRegistryBase *op_registry) { +void RegisterReduce(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp, diff --git a/mace/ops/ref/activation.cc b/mace/ops/ref/activation.cc index 4e2e65dbe71ef5b0e243a2be7d7803028de1f8d8..da2ff26fabd940d0a5e1822df2d37486344cfcd7 100644 --- a/mace/ops/ref/activation.cc +++ b/mace/ops/ref/activation.cc @@ -13,18 +13,26 @@ // limitations under the License. 
#include -#include "mace/ops/ref/activation.h" + +#include "mace/ops/delegator/activation.h" namespace mace { namespace ops { namespace ref { -Activation::Activation(ActivationType type, - const float limit, - const float leakyrelu_coefficient) - : type_(type), - limit_(limit), - leakyrelu_coefficient_(leakyrelu_coefficient) {} +class Activation : public delegator::Activation { + public: + explicit Activation(const delegator::ActivationParam ¶m) + : delegator::Activation(param) {} + ~Activation() = default; + + MaceStatus Compute(const OpContext *context, const Tensor *input, + Tensor *output) override; + + private: + void DoActivation(const OpContext *context, const Tensor *input, + Tensor *output); +}; MaceStatus Activation::Compute(const OpContext *context, const Tensor *input, @@ -99,6 +107,9 @@ void Activation::DoActivation(const OpContext *context, } } +MACE_REGISTER_DELEGATOR(registry, Activation, delegator::ActivationParam, + MACE_DELEGATOR_KEY(Activation, CPU, float, REF)) + } // namespace ref } // namespace ops } // namespace mace diff --git a/mace/ops/ref/activation.h b/mace/ops/ref/activation.h deleted file mode 100644 index 7ad986a50ceed14b021abf2a4d81f2bb7b336e19..0000000000000000000000000000000000000000 --- a/mace/ops/ref/activation.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_REF_ACTIVATION_H_ -#define MACE_OPS_REF_ACTIVATION_H_ - -#include "mace/core/op_context.h" -#include "mace/ops/common/activation_type.h" - -namespace mace { -namespace ops { -namespace ref { - -class Activation { - public: - explicit Activation(ActivationType type, - const float limit, - const float leakyrelu_coefficient); - ~Activation() = default; - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - Tensor *output); - - private: - void DoActivation(const OpContext *context, - const Tensor *input, - Tensor *output); - - ActivationType type_; - const float limit_; - const float leakyrelu_coefficient_; -}; - -} // namespace ref -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_REF_ACTIVATION_H_ diff --git a/mace/ops/ref/bias_add.cc b/mace/ops/ref/bias_add.cc index efc56f74b412814da9643eccd9e4ce459299c622..221c2d2e9cc9b00f6157bdedaa276db36fc4dba3 100644 --- a/mace/ops/ref/bias_add.cc +++ b/mace/ops/ref/bias_add.cc @@ -12,12 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/ops/ref/bias_add.h" +#include "mace/ops/delegator/bias_add.h" namespace mace { namespace ops { namespace ref { +class BiasAdd : public delegator::BiasAdd { + public: + explicit BiasAdd(const DelegatorParam ¶m) : delegator::BiasAdd(param) {} + ~BiasAdd() = default; + + MaceStatus Compute(const OpContext *context, const Tensor *input, + const Tensor *bias, Tensor *output) override; + + private: + void AddBias(const OpContext *context, const Tensor *input, + const Tensor *bias, Tensor *output); +}; + MaceStatus BiasAdd::Compute(const OpContext *context, const Tensor *input, const Tensor *bias, @@ -71,6 +84,9 @@ void BiasAdd::AddBias(const OpContext *context, } } +MACE_REGISTER_DELEGATOR(registry, BiasAdd, DelegatorParam, + MACE_DELEGATOR_KEY(BiasAdd, CPU, float, REF)) + } // namespace ref } // namespace ops } // namespace mace diff --git a/mace/ops/ref/conv_2d.cc b/mace/ops/ref/conv_2d.cc index 1c69ee9d72e98dbb357347ed2d4e10d971e1cb07..d90b7e2bcddb4f2bb8e5997637e4f189eb3c2ba7 100644 --- a/mace/ops/ref/conv_2d.cc +++ b/mace/ops/ref/conv_2d.cc @@ -109,6 +109,10 @@ MaceStatus Conv2d::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } +typedef Conv2d Conv2dRef; +MACE_REGISTER_DELEGATOR(registry, Conv2dRef, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, REF, General)) + } // namespace ref } // namespace ops } // namespace mace diff --git a/mace/ops/ref/conv_2d.h b/mace/ops/ref/conv_2d.h index 9a9fbb8f92363fed058d9a96929714c8870ab028..b241a58a179af6c485dc9ed916bb4f1c3dfae401 100644 --- a/mace/ops/ref/conv_2d.h +++ b/mace/ops/ref/conv_2d.h @@ -18,64 +18,41 @@ #include -#include "mace/public/mace.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/core/op_context.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/delegator/conv_2d.h" +#include "mace/public/mace.h" namespace mace { namespace ops { namespace ref { template -class Conv2d { +class Conv2d : public delegator::Conv2d { public: - Conv2d(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type) - : strides_(strides), - dilations_(dilations), - paddings_(paddings), - padding_type_(padding_type) {} + explicit Conv2d(const delegator::Conv2dParam ¶m) + : delegator::Conv2d(param) {} ~Conv2d() {} MaceStatus Compute( const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); - - private: - const std::vector strides_; - const std::vector dilations_; - const std::vector paddings_; - const Padding padding_type_; + Tensor *output) override; }; template<> -class Conv2d { +class Conv2d : public delegator::Conv2d { public: - Conv2d(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type) - : strides_(strides), - dilations_(dilations), - paddings_(paddings), - padding_type_(padding_type) {} + explicit Conv2d(const delegator::Conv2dParam ¶m) + : delegator::Conv2d(param) {} ~Conv2d() {} MaceStatus Compute( const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); - - private: - const std::vector strides_; - const std::vector dilations_; - const std::vector paddings_; - const Padding padding_type_; + Tensor *output) override; }; } // namespace ref diff --git a/mace/ops/ref/deconv_2d.cc b/mace/ops/ref/deconv_2d.cc index d06c6634548dfb079f615f01f9e394950a214059..d19a96d273cb99096d3d0bf4877d558b4edff780 100644 --- a/mace/ops/ref/deconv_2d.cc +++ b/mace/ops/ref/deconv_2d.cc 
@@ -162,6 +162,11 @@ MaceStatus Deconv2d::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } +typedef Deconv2d Deconv2dRef; +MACE_REGISTER_DELEGATOR( + registry, Deconv2dRef, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, REF, General)) + } // namespace ref } // namespace ops } // namespace mace diff --git a/mace/ops/ref/deconv_2d.h b/mace/ops/ref/deconv_2d.h index a8ab6722b47037f2552faaea8d8cca5151f463ae..564ce7e7afdac1412ef2ddce8a20e2286ab7b3b0 100644 --- a/mace/ops/ref/deconv_2d.h +++ b/mace/ops/ref/deconv_2d.h @@ -18,28 +18,21 @@ #include -#include "mace/public/mace.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/core/op_context.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/delegator/deconv_2d.h" +#include "mace/public/mace.h" namespace mace { namespace ops { namespace ref { template -class Deconv2d { +class Deconv2d : public delegator::Deconv2d { public: - Deconv2d(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : strides_(strides), - dilations_(dilations), - paddings_(paddings), - padding_type_(padding_type), - framework_type_(framework_type) {} + explicit Deconv2d(const delegator::Deconv2dParam ¶m) + : delegator::Deconv2d(param) {} ~Deconv2d() = default; @@ -48,29 +41,14 @@ class Deconv2d { const Tensor *input, const Tensor *filter, const Tensor *output_shape, - Tensor *output); - - private: - const std::vector strides_; - const std::vector dilations_; - const std::vector paddings_; - const Padding padding_type_; - const FrameworkType framework_type_; + Tensor *output) override; }; template<> -class Deconv2d { +class Deconv2d : public delegator::Deconv2d { public: - Deconv2d(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : strides_(strides), - dilations_(dilations), - paddings_(paddings), - padding_type_(padding_type), - framework_type_(framework_type) {} + explicit Deconv2d(const delegator::Deconv2dParam ¶m) + : delegator::Deconv2d(param) {} ~Deconv2d() = default; @@ -79,14 +57,7 @@ class Deconv2d { const Tensor *input, const Tensor *filter, const Tensor *output_shape, - Tensor *output); - - private: - const std::vector strides_; - const std::vector dilations_; - const std::vector paddings_; - const Padding padding_type_; - const FrameworkType framework_type_; + Tensor *output) override; }; } // namespace ref diff --git a/mace/ops/ref/depthwise_conv_2d.cc b/mace/ops/ref/depthwise_conv_2d.cc index bff950690d719103c31f4ddeb36a7cd934e256c3..03be506ce1e7ea36cb6a763db83c4f50bb0f1e0b 100644 --- a/mace/ops/ref/depthwise_conv_2d.cc +++ b/mace/ops/ref/depthwise_conv_2d.cc @@ -115,6 +115,11 @@ MaceStatus DepthwiseConv2d::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } +typedef DepthwiseConv2d DepthwiseConv2dRef; +MACE_REGISTER_DELEGATOR( + registry, DepthwiseConv2dRef, delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, REF, General)) + } // namespace ref } // namespace ops } // namespace mace diff --git a/mace/ops/ref/depthwise_conv_2d.h b/mace/ops/ref/depthwise_conv_2d.h index 91a95192a43ba2cc97bc9cc08b9774e2fc6d0a8a..cc5a14ca433b62e9e50973e511551beab5dd5160 100644 --- a/mace/ops/ref/depthwise_conv_2d.h +++ b/mace/ops/ref/depthwise_conv_2d.h @@ -18,64 +18,41 @@ #include -#include "mace/public/mace.h" 
+#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/core/op_context.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/delegator/depthwise_conv_2d.h" +#include "mace/public/mace.h" namespace mace { namespace ops { namespace ref { template -class DepthwiseConv2d { +class DepthwiseConv2d : public delegator::DepthwiseConv2d { public: - DepthwiseConv2d(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type) - : strides_(strides), - dilations_(dilations), - paddings_(paddings), - padding_type_(padding_type) {} + explicit DepthwiseConv2d(const delegator::DepthwiseConv2dParam ¶m) + : delegator::DepthwiseConv2d(param) {} ~DepthwiseConv2d() {} MaceStatus Compute( const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); - - private: - const std::vector strides_; - const std::vector dilations_; - const std::vector paddings_; - const Padding padding_type_; + Tensor *output) override; }; template<> -class DepthwiseConv2d { +class DepthwiseConv2d : public delegator::DepthwiseConv2d { public: - DepthwiseConv2d(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type) - : strides_(strides), - dilations_(dilations), - paddings_(paddings), - padding_type_(padding_type) {} + explicit DepthwiseConv2d(const delegator::DepthwiseConv2dParam ¶m) + : delegator::DepthwiseConv2d(param) {} ~DepthwiseConv2d() {} MaceStatus Compute( const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); - - private: - const std::vector strides_; - const std::vector dilations_; - const std::vector paddings_; - const Padding padding_type_; + Tensor *output) override; }; } // namespace ref diff --git a/mace/ops/ref/depthwise_deconv_2d.cc b/mace/ops/ref/depthwise_deconv_2d.cc index 63b3aa6959ef343ef226a671614626f73578ea53..badded160c49037dc0496a7cccaefe037459a8f0 100644 --- a/mace/ops/ref/depthwise_deconv_2d.cc +++ b/mace/ops/ref/depthwise_deconv_2d.cc @@ -302,6 +302,11 @@ MaceStatus GroupDeconv2d::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } +typedef DepthwiseDeconv2d DepthwiseDeconv2dRef; +MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dRef, delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, REF, General)) + } // namespace ref } // namespace ops } // namespace mace diff --git a/mace/ops/ref/depthwise_deconv_2d.h b/mace/ops/ref/depthwise_deconv_2d.h index 5da7487192a3762e6219716969a826e3f602a85a..586f2627838c30bcb366a850f5b230af980cafca 100644 --- a/mace/ops/ref/depthwise_deconv_2d.h +++ b/mace/ops/ref/depthwise_deconv_2d.h @@ -18,63 +18,37 @@ #include -#include "mace/public/mace.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/core/op_context.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/delegator/depthwise_deconv_2d.h" +#include "mace/public/mace.h" namespace mace { namespace ops { namespace ref { template -class GroupDeconv2d { +class GroupDeconv2d : public delegator::GroupDeconv2d { public: - GroupDeconv2d(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type, - const index_t group, - const FrameworkType framework_type) - : strides_(strides), - dilations_(dilations), - paddings_(paddings), - padding_type_(padding_type), - group_(group), - framework_type_(framework_type) {} + explicit 
GroupDeconv2d(const delegator::GroupDeconv2dParam ¶m) + : delegator::GroupDeconv2d(param) {} virtual ~GroupDeconv2d() = default; - virtual MaceStatus Compute( + MaceStatus Compute( const OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *output_shape, - Tensor *output); - - private: - const std::vector strides_; - const std::vector dilations_; - const std::vector paddings_; - const Padding padding_type_; - const index_t group_; - const FrameworkType framework_type_; + Tensor *output) override; }; template class DepthwiseDeconv2d : public GroupDeconv2d { public: - DepthwiseDeconv2d(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : GroupDeconv2d(strides, - dilations, - paddings, - padding_type, - 0, - framework_type) {} + explicit DepthwiseDeconv2d(const delegator::DepthwiseDeconv2d ¶m) + : GroupDeconv2d(param) {} ~DepthwiseDeconv2d() = default; @@ -83,57 +57,30 @@ class DepthwiseDeconv2d : public GroupDeconv2d { const Tensor *input, const Tensor *filter, const Tensor *output_shape, - Tensor *output); + Tensor *output) override; }; template<> -class GroupDeconv2d { +class GroupDeconv2d : public delegator::GroupDeconv2d { public: - GroupDeconv2d(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type, - const index_t group, - const FrameworkType framework_type) - : strides_(strides), - dilations_(dilations), - paddings_(paddings), - padding_type_(padding_type), - group_(group), - framework_type_(framework_type) {} + explicit GroupDeconv2d(const delegator::GroupDeconv2dParam ¶m) + : delegator::GroupDeconv2d(param) {} virtual ~GroupDeconv2d() = default; - virtual MaceStatus Compute( + MaceStatus Compute( const OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *output_shape, - Tensor *output); - - protected: - const std::vector strides_; - const std::vector dilations_; - const std::vector paddings_; - const Padding padding_type_; - const index_t group_; - const FrameworkType framework_type_; + Tensor *output) override; }; template<> class DepthwiseDeconv2d : public GroupDeconv2d { public: - DepthwiseDeconv2d(const std::vector &strides, - const std::vector &dilations, - const std::vector &paddings, - const Padding padding_type, - const FrameworkType framework_type) - : GroupDeconv2d(strides, - dilations, - paddings, - padding_type, - 0, - framework_type) {} + explicit DepthwiseDeconv2d(const delegator::DepthwiseDeconv2dParam ¶m) + : GroupDeconv2d(param) {} ~DepthwiseDeconv2d() = default; @@ -142,7 +89,7 @@ class DepthwiseDeconv2d : public GroupDeconv2d { const Tensor *input, const Tensor *filter, const Tensor *output_shape, - Tensor *output); + Tensor *output) override; }; } // namespace ref diff --git a/mace/ops/ref/gemm.cc b/mace/ops/ref/gemm.cc index e9d13c91bd9cb2b67eff1d997c94ba5bd4dba8b3..956a7affbf22904b2ab6a023c5ed2756660fe765 100644 --- a/mace/ops/ref/gemm.cc +++ b/mace/ops/ref/gemm.cc @@ -111,6 +111,10 @@ MaceStatus Gemm::Compute(const OpContext *context, output); } +typedef Gemm GemmRef; +MACE_REGISTER_DELEGATOR(registry, GemmRef, delegator::GemmParam, + MACE_DELEGATOR_KEY(Gemm, CPU, float, REF)) + } // namespace ref } // namespace ops } // namespace mace diff --git a/mace/ops/ref/gemm.h b/mace/ops/ref/gemm.h index bf1826ada55243e0abcba28eb9d0ca907fc87c45..b7b63fba856d862542f1afe4315990933c3271d2 100644 --- a/mace/ops/ref/gemm.h +++ b/mace/ops/ref/gemm.h @@ 
-16,19 +16,20 @@ #ifndef MACE_OPS_REF_GEMM_H_ #define MACE_OPS_REF_GEMM_H_ -#include "mace/public/mace.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/core/op_context.h" #include "mace/ops/common/matrix.h" +#include "mace/ops/delegator/gemm.h" +#include "mace/public/mace.h" namespace mace { namespace ops { namespace ref { template -class Gemm { +class Gemm : public delegator::Gemm { public: - Gemm() {} + explicit Gemm(const delegator::GemmParam ¶m) : delegator::Gemm(param) {} ~Gemm() {} MaceStatus Compute(const OpContext *context, const Tensor *lhs, @@ -42,13 +43,13 @@ class Gemm { const MatrixMajor output_major, const bool lhs_batched, const bool rhs_batched, - Tensor *output); + Tensor *output) override; }; template<> -class Gemm { +class Gemm : public delegator::Gemm { public: - Gemm() {} + explicit Gemm(const delegator::GemmParam ¶m) : delegator::Gemm(param) {} ~Gemm() {} MaceStatus Compute(const OpContext *context, const Tensor *lhs, @@ -62,7 +63,7 @@ class Gemm { const MatrixMajor output_major, const bool lhs_batched, const bool rhs_batched, - Tensor *output); + Tensor *output) override; // Original matrix before transpose has row-major MaceStatus Compute( const OpContext *context, @@ -78,7 +79,7 @@ class Gemm { const bool transpose_out, const bool lhs_batched, const bool rhs_batched, - Tensor *output); + Tensor *output) override; }; } // namespace ref diff --git a/mace/ops/ref/gemv.cc b/mace/ops/ref/gemv.cc index bf0366f3ce8cab2c848172b511cdfb98d1cb9d27..350412c2f548b67d737bcffc924c36582866d05f 100644 --- a/mace/ops/ref/gemv.cc +++ b/mace/ops/ref/gemv.cc @@ -159,8 +159,16 @@ MaceStatus Gemv::Compute(const OpContext *context, } // b return MaceStatus::MACE_SUCCESS; } + +typedef Gemv GemvUint8Ref; +MACE_REGISTER_DELEGATOR(registry, GemvUint8Ref, DelegatorParam, + MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, Ref)) #endif // MACE_ENABLE_QUANTIZE +typedef Gemv GemvRef; +MACE_REGISTER_DELEGATOR(registry, GemvRef, DelegatorParam, + MACE_DELEGATOR_KEY(Gemv, CPU, float, REF)) + } // namespace ref } // namespace ops } // namespace mace diff --git a/mace/ops/ref/gemv.h b/mace/ops/ref/gemv.h index 7116b8fa81b214f6b3405aacc7ea18a18e449cf6..e14730bbd9556e0f14356c88e8276fcebd3ae5ec 100644 --- a/mace/ops/ref/gemv.h +++ b/mace/ops/ref/gemv.h @@ -16,18 +16,19 @@ #ifndef MACE_OPS_REF_GEMV_H_ #define MACE_OPS_REF_GEMV_H_ -#include "mace/public/mace.h" +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/core/op_context.h" +#include "mace/ops/delegator/gemv.h" +#include "mace/public/mace.h" namespace mace { namespace ops { namespace ref { template -class Gemv { +class Gemv : public delegator::Gemv { public: - Gemv() {} + explicit Gemv(const DelegatorParam ¶m) : delegator::Gemv(param) {} ~Gemv() {} // Always row-major after transpose MaceStatus Compute( @@ -40,13 +41,13 @@ class Gemv { const index_t lhs_width, const bool lhs_batched, const bool rhs_batched, - Tensor *output); + Tensor *output) override; }; template<> -class Gemv { +class Gemv : public delegator::Gemv { public: - Gemv() {} + explicit Gemv(const DelegatorParam ¶m) : delegator::Gemv(param) {} ~Gemv() {} // Always row-major after transpose MaceStatus Compute( @@ -59,14 +60,14 @@ class Gemv { const index_t lhs_width, const bool lhs_batched, const bool rhs_batched, - Tensor *output); + Tensor *output) override; }; #if defined(MACE_ENABLE_QUANTIZE) template<> -class Gemv { +class Gemv : public delegator::Gemv { public: - Gemv() {} + explicit Gemv(const DelegatorParam ¶m) : 
delegator::Gemv(param) {} ~Gemv() {} // Always row-major after transpose MaceStatus Compute( @@ -79,13 +80,13 @@ class Gemv { const index_t lhs_width, const bool lhs_batched, const bool rhs_batched, - Tensor *output); + Tensor *output) override; }; template<> -class Gemv { +class Gemv : public delegator::Gemv { public: - Gemv() {} + explicit Gemv(const DelegatorParam ¶m) : delegator::Gemv(param) {} ~Gemv() {} // Always row-major after transpose MaceStatus Compute( @@ -98,7 +99,7 @@ class Gemv { const index_t lhs_width, const bool lhs_batched, const bool rhs_batched, - Tensor *output); + Tensor *output) override; }; #endif // MACE_ENABLE_QUANTIZE diff --git a/mace/ops/ref/q8/eltwise.cc b/mace/ops/ref/q8/eltwise.cc new file mode 100644 index 0000000000000000000000000000000000000000..220378e4e0e1fdf52d091abf4d974f92edb57eec --- /dev/null +++ b/mace/ops/ref/q8/eltwise.cc @@ -0,0 +1,116 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "mace/ops/common/gemmlowp_util.h" +#include "mace/ops/delegator/eltwise.h" +#include "mace/utils/logging.h" + +namespace mace { +namespace ops { +namespace ref { +namespace q8 { + +class Eltwise : public delegator::Eltwise { + public: + explicit Eltwise(const delegator::EltwiseParam ¶m) + : delegator::Eltwise(param) {} + ~Eltwise() = default; + + MaceStatus Compute(const OpContext *context, const Tensor *input0, + const Tensor *input1, Tensor *output) override; +}; + +MaceStatus Eltwise::Compute(const OpContext *context, + const Tensor *input0, + const Tensor *input1, + Tensor *output) { + constexpr int left_shift = 20; + const double doubled_scale = 2 * std::max(input0->scale(), input1->scale()); + const double adjusted_input0_scale = input0->scale() / doubled_scale; + const double adjusted_input1_scale = input1->scale() / doubled_scale; + const double adjusted_output_scale = + doubled_scale / ((1 << left_shift) * output->scale()); + + int32_t input0_multiplier; + int32_t input1_multiplier; + int32_t output_multiplier; + int32_t input0_shift; + int32_t input1_shift; + int32_t output_shift; + QuantizeMultiplier(adjusted_input0_scale, + &input0_multiplier, + &input0_shift); + QuantizeMultiplier(adjusted_input1_scale, + &input1_multiplier, + &input1_shift); + QuantizeMultiplier(adjusted_output_scale, + &output_multiplier, + &output_shift); + + Tensor::MappingGuard input0_guard(input0); + Tensor::MappingGuard input1_guard(input1); + Tensor::MappingGuard output_guard(output); + + auto input0_ptr = input0->data(); + auto input1_ptr = input1->data(); + auto output_ptr = output->mutable_data(); + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); + const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); + const int32_t shifted_input0 = 
offset_input0 * (1 << left_shift); + const int32_t shifted_input1 = offset_input1 * (1 << left_shift); + const int32_t multiplied_input0 = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0, + input0_multiplier), + -input0_shift); + const int32_t multiplied_input1 = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, + input1_multiplier), + -input1_shift); + + int32_t res; + if (type_ == SUM) { + res = multiplied_input0 + multiplied_input1; + } else { + res = multiplied_input0 - multiplied_input1; + } + + const int32_t output_val = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(res, + output_multiplier), + -output_shift) + output->zero_point(); + output_ptr[i] = Saturate(output_val); + } + }, 0, output->size(), 1); + + return MaceStatus::MACE_SUCCESS; +} + +MACE_REGISTER_DELEGATOR(registry, Eltwise, delegator::EltwiseParam, + MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, REF)) + +} // namespace q8 +} // namespace ref +} // namespace ops +} // namespace mace diff --git a/mace/ops/registry/op_delegators_registry.cc b/mace/ops/registry/op_delegators_registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..a596878016b222f1606f39201d18b0a40653485f --- /dev/null +++ b/mace/ops/registry/op_delegators_registry.cc @@ -0,0 +1,170 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "mace/ops/registry/registry.h"
+
+namespace mace {
+namespace ops {
+
+namespace ref {
+extern void RegisterActivationDelegator(OpDelegatorRegistry *registry);
+extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry);
+extern void RegisterConv2dRefDelegator(OpDelegatorRegistry *registry);
+extern void RegisterDeconv2dRefDelegator(OpDelegatorRegistry *registry);
+extern void RegisterDepthwiseConv2dRefDelegator(OpDelegatorRegistry *registry);
+extern void RegisterDepthwiseDeconv2dRefDelegator(
+    OpDelegatorRegistry *registry);
+extern void RegisterGemmRefDelegator(OpDelegatorRegistry *registry);
+extern void RegisterGemvRefDelegator(OpDelegatorRegistry *registry);
+
+#ifdef MACE_ENABLE_QUANTIZE
+namespace q8 {
+extern void RegisterEltwiseDelegator(OpDelegatorRegistry *registry);
+} // namespace q8
+extern void RegisterGemvUint8RefDelegator(OpDelegatorRegistry *registry);
+#endif // MACE_ENABLE_QUANTIZE
+} // namespace ref
+
+#ifdef MACE_ENABLE_NEON
+namespace arm {
+namespace fp32 {
+extern void RegisterActivationDelegator(OpDelegatorRegistry *registry);
+extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry);
+
+extern void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry);
+extern void RegisterConv2dK1x7S1Delegator(OpDelegatorRegistry *registry);
+extern void RegisterConv2dK7x1S1Delegator(OpDelegatorRegistry *registry);
+extern void RegisterConv2dK1x15S1Delegator(OpDelegatorRegistry *registry);
+extern void RegisterConv2dK15x1S1Delegator(OpDelegatorRegistry *registry);
+extern void RegisterConv2dK3x3S1Delegator(OpDelegatorRegistry *registry);
+extern void RegisterConv2dK3x3S2Delegator(OpDelegatorRegistry *registry);
+extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry);
+extern void RegisterConv2dK5x5S1Delegator(OpDelegatorRegistry *registry);
+extern void RegisterConv2dK7x7S1Delegator(OpDelegatorRegistry *registry);
+extern void RegisterConv2dK7x7S2Delegator(OpDelegatorRegistry *registry);
+extern void RegisterConv2dK7x7S3Delegator(OpDelegatorRegistry *registry);
+extern void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry);
+
+extern void RegisterDeconv2dK2x2S1Delegator(OpDelegatorRegistry *registry);
+extern void RegisterDeconv2dK2x2S2Delegator(OpDelegatorRegistry *registry);
+extern void RegisterDeconv2dK3x3S1Delegator(OpDelegatorRegistry *registry);
+extern void RegisterDeconv2dK3x3S2Delegator(OpDelegatorRegistry *registry);
+extern void RegisterDeconv2dK4x4S1Delegator(OpDelegatorRegistry *registry);
+extern void RegisterDeconv2dK4x4S2Delegator(OpDelegatorRegistry *registry);
+extern void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry);
+
+extern void RegisterDepthwiseConv2dK3x3S1Delegator(
+    OpDelegatorRegistry *registry);
+extern void RegisterDepthwiseConv2dK3x3S2Delegator(
+    OpDelegatorRegistry *registry);
+extern void RegisterDepthwiseDeconv2dK3x3S1Delegator(
+    OpDelegatorRegistry *registry);
+extern void RegisterDepthwiseDeconv2dK3x3S2Delegator(
+    OpDelegatorRegistry *registry);
+extern void RegisterGroupDeconv2dK3x3S1Delegator(OpDelegatorRegistry *registry);
+extern void RegisterGroupDeconv2dK3x3S2Delegator(OpDelegatorRegistry *registry);
+extern void RegisterDepthwiseDeconv2dK4x4S1Delegator(
+    OpDelegatorRegistry *registry);
+extern void RegisterDepthwiseDeconv2dK4x4S2Delegator(
+    OpDelegatorRegistry *registry);
+extern void RegisterGroupDeconv2dK4x4S1Delegator(OpDelegatorRegistry *registry);
+extern void RegisterGroupDeconv2dK4x4S2Delegator(OpDelegatorRegistry *registry);
+extern void RegisterDepthwiseDeconv2dGeneralDelegator( + OpDelegatorRegistry *registry); +extern void RegisterGroupDeconv2dGeneralDelegator( + OpDelegatorRegistry *registry); + +extern void RegisterGemmDelegator(OpDelegatorRegistry *registry); +extern void RegisterGemvDelegator(OpDelegatorRegistry *registry); +} // namespace fp32 + +#ifdef MACE_ENABLE_QUANTIZE +namespace q8 { +extern void RegisterEltwiseDelegator(OpDelegatorRegistry *registry); +extern void RegisterGemvUint8Delegator(OpDelegatorRegistry *registry); +extern void RegisterGemvInt32Delegator(OpDelegatorRegistry *registry); +} // namespace q8 +#endif // MACE_ENABLE_QUANTIZE + +} // namespace arm +#endif // MACE_ENABLE_NEON + +void RegisterAllOpDelegators(OpDelegatorRegistry *registry) { + ref::RegisterActivationDelegator(registry); + ref::RegisterBiasAddDelegator(registry); + ref::RegisterConv2dRefDelegator(registry); + ref::RegisterDeconv2dRefDelegator(registry); + ref::RegisterDepthwiseConv2dRefDelegator(registry); + ref::RegisterDepthwiseDeconv2dRefDelegator(registry); + ref::RegisterGemmRefDelegator(registry); + ref::RegisterGemvRefDelegator(registry); + +#ifdef MACE_ENABLE_QUANTIZE + ref::q8::RegisterEltwiseDelegator(registry); + ref::RegisterGemvUint8RefDelegator(registry); +#endif // MACE_ENABLE_QUANTIZE + +#ifdef MACE_ENABLE_NEON + arm::fp32::RegisterActivationDelegator(registry); + arm::fp32::RegisterBiasAddDelegator(registry); + + arm::fp32::RegisterConv2dK1x1Delegator(registry); + arm::fp32::RegisterConv2dK1x7S1Delegator(registry); + arm::fp32::RegisterConv2dK7x1S1Delegator(registry); + arm::fp32::RegisterConv2dK1x15S1Delegator(registry); + arm::fp32::RegisterConv2dK15x1S1Delegator(registry); + arm::fp32::RegisterConv2dK3x3S1Delegator(registry); + arm::fp32::RegisterConv2dK3x3S2Delegator(registry); + arm::fp32::RegisterConv2dK3x3WinogradDelegator(registry); + arm::fp32::RegisterConv2dK5x5S1Delegator(registry); + arm::fp32::RegisterConv2dK7x7S1Delegator(registry); + arm::fp32::RegisterConv2dK7x7S2Delegator(registry); + arm::fp32::RegisterConv2dK7x7S3Delegator(registry); + arm::fp32::RegisterConv2dGeneralDelegator(registry); + + arm::fp32::RegisterDeconv2dK2x2S1Delegator(registry); + arm::fp32::RegisterDeconv2dK2x2S2Delegator(registry); + arm::fp32::RegisterDeconv2dK3x3S1Delegator(registry); + arm::fp32::RegisterDeconv2dK3x3S2Delegator(registry); + arm::fp32::RegisterDeconv2dK4x4S1Delegator(registry); + arm::fp32::RegisterDeconv2dK4x4S2Delegator(registry); + arm::fp32::RegisterDeconv2dGeneralDelegator(registry); + + arm::fp32::RegisterDepthwiseConv2dK3x3S1Delegator(registry); + arm::fp32::RegisterDepthwiseConv2dK3x3S2Delegator(registry); + arm::fp32::RegisterDepthwiseDeconv2dK3x3S1Delegator(registry); + arm::fp32::RegisterDepthwiseDeconv2dK3x3S2Delegator(registry); + arm::fp32::RegisterGroupDeconv2dK3x3S1Delegator(registry); + arm::fp32::RegisterGroupDeconv2dK3x3S2Delegator(registry); + arm::fp32::RegisterDepthwiseDeconv2dK4x4S1Delegator(registry); + arm::fp32::RegisterDepthwiseDeconv2dK4x4S2Delegator(registry); + arm::fp32::RegisterGroupDeconv2dK4x4S1Delegator(registry); + arm::fp32::RegisterGroupDeconv2dK4x4S2Delegator(registry); + arm::fp32::RegisterDepthwiseDeconv2dGeneralDelegator(registry); + arm::fp32::RegisterGroupDeconv2dGeneralDelegator(registry); + + arm::fp32::RegisterGemmDelegator(registry); + arm::fp32::RegisterGemvDelegator(registry); + +#ifdef MACE_ENABLE_QUANTIZE + arm::q8::RegisterEltwiseDelegator(registry); + arm::q8::RegisterGemvUint8Delegator(registry); + 
arm::q8::RegisterGemvInt32Delegator(registry); +#endif // MACE_ENABLE_QUANTIZE + +#endif // MACE_ENABLE_NEON +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/registry/ops_registry.cc b/mace/ops/registry/ops_registry.cc index eafa78ceb876549fff28cd2eb48df719ff3a17e9..2f6e8c73e8d424d709ca1dcab43981daf3c0a151 100644 --- a/mace/ops/registry/ops_registry.cc +++ b/mace/ops/registry/ops_registry.cc @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,167 +12,167 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/ops/registry/ops_registry.h" +#include "mace/ops/registry/registry.h" namespace mace { namespace ops { // Keep in lexicographical order -extern void RegisterActivation(OpRegistryBase *op_registry); -extern void RegisterAddN(OpRegistryBase *op_registry); -extern void RegisterArgMax(OpRegistryBase *op_registry); -extern void RegisterBatchNorm(OpRegistryBase *op_registry); -extern void RegisterBatchToSpaceND(OpRegistryBase *op_registry); -extern void RegisterBiasAdd(OpRegistryBase *op_registry); -extern void RegisterCast(OpRegistryBase *op_registry); -extern void RegisterChannelShuffle(OpRegistryBase *op_registry); -extern void RegisterConcat(OpRegistryBase *op_registry); -extern void RegisterConv2D(OpRegistryBase *op_registry); -extern void RegisterCrop(OpRegistryBase *op_registry); -extern void RegisterCumsum(OpRegistryBase *op_registry); -extern void RegisterDeconv2D(OpRegistryBase *op_registry); -extern void RegisterDepthToSpace(OpRegistryBase *op_registry); -extern void RegisterDepthwiseConv2d(OpRegistryBase *op_registry); -extern void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry); -extern void RegisterDynamicLSTM(OpRegistryBase *op_registry); -extern void RegisterEltwise(OpRegistryBase *op_registry); -extern void RegisterExpandDims(OpRegistryBase *op_registry); -extern void RegisterExtractPooling(OpRegistryBase *op_registry); -extern void RegisterFill(OpRegistryBase *op_registry); -extern void RegisterFullyConnected(OpRegistryBase *op_registry); -extern void RegisterGather(OpRegistryBase *op_registry); -extern void RegisterIdentity(OpRegistryBase *op_registry); -extern void RegisterIfDefined(OpRegistryBase *op_registry); -extern void RegisterInferConv2dShape(OpRegistryBase *op_registry); -extern void RegisterKaldiBatchNorm(OpRegistryBase *op_registry); -extern void RegisterLocalResponseNorm(OpRegistryBase *op_registry); -extern void RegisterLpNorm(OpRegistryBase *op_registry); -extern void RegisterLSTMNonlinear(OpRegistryBase *op_registry); -extern void RegisterMatMul(OpRegistryBase *op_registry); -extern void RegisterMVNorm(OpRegistryBase *op_registry); -extern void RegisterOneHot(OpRegistryBase *op_registry); -extern void RegisterPad(OpRegistryBase *op_registry); -extern void RegisterPadContext(OpRegistryBase *op_registry); -extern void RegisterPNorm(OpRegistryBase *op_registry); -extern void RegisterPooling(OpRegistryBase *op_registry); -extern void RegisterReduce(OpRegistryBase *op_registry); -extern void RegisterReplaceIndex(OpRegistryBase *op_registry); -extern void RegisterPriorBox(OpRegistryBase *op_registry); -extern void RegisterReshape(OpRegistryBase *op_registry); -extern void RegisterResizeBicubic(OpRegistryBase *op_registry); -extern void 
RegisterResizeBilinear(OpRegistryBase *op_registry); -extern void RegisterResizeNearestNeighbor(OpRegistryBase *op_registry); -extern void RegisterReverse(OpRegistryBase *op_registry); -extern void RegisterScalarMath(OpRegistryBase *op_registry); -extern void RegisterSelect(OpRegistryBase *op_registry); -extern void RegisterShape(OpRegistryBase *op_registry); -extern void RegisterSlice(OpRegistryBase *op_registry); -extern void RegisterSoftmax(OpRegistryBase *op_registry); -extern void RegisterSpaceToBatchND(OpRegistryBase *op_registry); -extern void RegisterSpaceToDepth(OpRegistryBase *op_registry); -extern void RegisterSplice(OpRegistryBase *op_registry); -extern void RegisterSplit(OpRegistryBase *op_registry); -extern void RegisterSqrDiffMean(OpRegistryBase *op_registry); -extern void RegisterSqueeze(OpRegistryBase *op_registry); -extern void RegisterStack(OpRegistryBase *op_registry); -extern void RegisterStridedSlice(OpRegistryBase *op_registry); -extern void RegisterSubsample(OpRegistryBase *op_registry); -extern void RegisterSumGroup(OpRegistryBase *op_registry); -extern void RegisterTargetRMSNorm(OpRegistryBase *op_registry); -extern void RegisterTile(OpRegistryBase *op_registry); -extern void RegisterTranspose(OpRegistryBase *op_registry); -extern void RegisterUnstack(OpRegistryBase *op_registry); -extern void RegisterUnsqueeze(OpRegistryBase *op_registry); +extern void RegisterActivation(OpRegistry *op_registry); +extern void RegisterAddN(OpRegistry *op_registry); +extern void RegisterArgMax(OpRegistry *op_registry); +extern void RegisterBatchNorm(OpRegistry *op_registry); +extern void RegisterBatchToSpaceND(OpRegistry *op_registry); +extern void RegisterBiasAdd(OpRegistry *op_registry); +extern void RegisterCast(OpRegistry *op_registry); +extern void RegisterChannelShuffle(OpRegistry *op_registry); +extern void RegisterConcat(OpRegistry *op_registry); +extern void RegisterConv2D(OpRegistry *op_registry); +extern void RegisterCrop(OpRegistry *op_registry); +extern void RegisterCumsum(OpRegistry *op_registry); +extern void RegisterDeconv2D(OpRegistry *op_registry); +extern void RegisterDepthToSpace(OpRegistry *op_registry); +extern void RegisterDepthwiseConv2d(OpRegistry *op_registry); +extern void RegisterDepthwiseDeconv2d(OpRegistry *op_registry); +extern void RegisterDynamicLSTM(OpRegistry *op_registry); +extern void RegisterEltwise(OpRegistry *op_registry); +extern void RegisterExpandDims(OpRegistry *op_registry); +extern void RegisterExtractPooling(OpRegistry *op_registry); +extern void RegisterFill(OpRegistry *op_registry); +extern void RegisterFullyConnected(OpRegistry *op_registry); +extern void RegisterGather(OpRegistry *op_registry); +extern void RegisterIdentity(OpRegistry *op_registry); +extern void RegisterIfDefined(OpRegistry *op_registry); +extern void RegisterInferConv2dShape(OpRegistry *op_registry); +extern void RegisterKaldiBatchNorm(OpRegistry *op_registry); +extern void RegisterLocalResponseNorm(OpRegistry *op_registry); +extern void RegisterLpNorm(OpRegistry *op_registry); +extern void RegisterLSTMNonlinear(OpRegistry *op_registry); +extern void RegisterMatMul(OpRegistry *op_registry); +extern void RegisterMVNorm(OpRegistry *op_registry); +extern void RegisterOneHot(OpRegistry *op_registry); +extern void RegisterPad(OpRegistry *op_registry); +extern void RegisterPadContext(OpRegistry *op_registry); +extern void RegisterPNorm(OpRegistry *op_registry); +extern void RegisterPooling(OpRegistry *op_registry); +extern void RegisterReduce(OpRegistry *op_registry); 
+extern void RegisterReplaceIndex(OpRegistry *op_registry); +extern void RegisterPriorBox(OpRegistry *op_registry); +extern void RegisterReshape(OpRegistry *op_registry); +extern void RegisterResizeBicubic(OpRegistry *op_registry); +extern void RegisterResizeBilinear(OpRegistry *op_registry); +extern void RegisterResizeNearestNeighbor(OpRegistry *op_registry); +extern void RegisterReverse(OpRegistry *op_registry); +extern void RegisterScalarMath(OpRegistry *op_registry); +extern void RegisterSelect(OpRegistry *op_registry); +extern void RegisterShape(OpRegistry *op_registry); +extern void RegisterSlice(OpRegistry *op_registry); +extern void RegisterSoftmax(OpRegistry *op_registry); +extern void RegisterSpaceToBatchND(OpRegistry *op_registry); +extern void RegisterSpaceToDepth(OpRegistry *op_registry); +extern void RegisterSplice(OpRegistry *op_registry); +extern void RegisterSplit(OpRegistry *op_registry); +extern void RegisterSqrDiffMean(OpRegistry *op_registry); +extern void RegisterSqueeze(OpRegistry *op_registry); +extern void RegisterStack(OpRegistry *op_registry); +extern void RegisterStridedSlice(OpRegistry *op_registry); +extern void RegisterSubsample(OpRegistry *op_registry); +extern void RegisterSumGroup(OpRegistry *op_registry); +extern void RegisterTargetRMSNorm(OpRegistry *op_registry); +extern void RegisterTile(OpRegistry *op_registry); +extern void RegisterTranspose(OpRegistry *op_registry); +extern void RegisterUnstack(OpRegistry *op_registry); +extern void RegisterUnsqueeze(OpRegistry *op_registry); #ifdef MACE_ENABLE_QUANTIZE -extern void RegisterDequantize(OpRegistryBase *op_registry); -extern void RegisterQuantize(OpRegistryBase *op_registry); +extern void RegisterDequantize(OpRegistry *op_registry); +extern void RegisterQuantize(OpRegistry *op_registry); #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -extern void RegisterBufferTransform(OpRegistryBase *op_registry); -extern void RegisterLSTMCell(OpRegistryBase *op_registry); +extern void RegisterBufferTransform(OpRegistry *op_registry); +extern void RegisterLSTMCell(OpRegistry *op_registry); #endif // MACE_ENABLE_OPENCL -} // namespace ops -OpRegistry::OpRegistry() : OpRegistryBase() { +void RegisterAllOps(OpRegistry *registry) { // Keep in lexicographical order - ops::RegisterActivation(this); - ops::RegisterAddN(this); - ops::RegisterArgMax(this); - ops::RegisterBatchNorm(this); - ops::RegisterBatchToSpaceND(this); - ops::RegisterBiasAdd(this); - ops::RegisterCast(this); - ops::RegisterChannelShuffle(this); - ops::RegisterConcat(this); - ops::RegisterConv2D(this); - ops::RegisterCrop(this); - ops::RegisterCumsum(this); - ops::RegisterDeconv2D(this); - ops::RegisterDepthToSpace(this); - ops::RegisterDepthwiseConv2d(this); - ops::RegisterDepthwiseDeconv2d(this); - ops::RegisterDynamicLSTM(this); - ops::RegisterEltwise(this); - ops::RegisterExpandDims(this); - ops::RegisterExtractPooling(this); - ops::RegisterFill(this); - ops::RegisterFullyConnected(this); - ops::RegisterGather(this); - ops::RegisterIdentity(this); - ops::RegisterIfDefined(this); - ops::RegisterInferConv2dShape(this); - ops::RegisterKaldiBatchNorm(this); - ops::RegisterLocalResponseNorm(this); - ops::RegisterLpNorm(this); - ops::RegisterLSTMNonlinear(this); - ops::RegisterMatMul(this); - ops::RegisterMVNorm(this); - ops::RegisterOneHot(this); - ops::RegisterPad(this); - ops::RegisterPadContext(this); - ops::RegisterPNorm(this); - ops::RegisterPooling(this); - ops::RegisterReduce(this); - ops::RegisterReplaceIndex(this); - 
ops::RegisterPriorBox(this); - ops::RegisterReshape(this); - ops::RegisterResizeBicubic(this); - ops::RegisterResizeBilinear(this); - ops::RegisterResizeNearestNeighbor(this); - ops::RegisterReverse(this); - ops::RegisterScalarMath(this); - ops::RegisterSelect(this); - ops::RegisterShape(this); - ops::RegisterSlice(this); - ops::RegisterSoftmax(this); - ops::RegisterSpaceToBatchND(this); - ops::RegisterSpaceToDepth(this); - ops::RegisterSplice(this); - ops::RegisterSplit(this); - ops::RegisterStack(this); - ops::RegisterStridedSlice(this); - ops::RegisterSqrDiffMean(this); - ops::RegisterSqueeze(this); - ops::RegisterSubsample(this); - ops::RegisterSumGroup(this); - ops::RegisterTargetRMSNorm(this); - ops::RegisterTile(this); - ops::RegisterTranspose(this); - ops::RegisterUnstack(this); - ops::RegisterUnsqueeze(this); + ops::RegisterActivation(registry); + ops::RegisterAddN(registry); + ops::RegisterArgMax(registry); + ops::RegisterBatchNorm(registry); + ops::RegisterBatchToSpaceND(registry); + ops::RegisterBiasAdd(registry); + ops::RegisterCast(registry); + ops::RegisterChannelShuffle(registry); + ops::RegisterConcat(registry); + ops::RegisterConv2D(registry); + ops::RegisterCrop(registry); + ops::RegisterCumsum(registry); + ops::RegisterDeconv2D(registry); + ops::RegisterDepthToSpace(registry); + ops::RegisterDepthwiseConv2d(registry); + ops::RegisterDepthwiseDeconv2d(registry); + ops::RegisterDynamicLSTM(registry); + ops::RegisterEltwise(registry); + ops::RegisterExpandDims(registry); + ops::RegisterExtractPooling(registry); + ops::RegisterFill(registry); + ops::RegisterFullyConnected(registry); + ops::RegisterGather(registry); + ops::RegisterIdentity(registry); + ops::RegisterIfDefined(registry); + ops::RegisterInferConv2dShape(registry); + ops::RegisterKaldiBatchNorm(registry); + ops::RegisterLocalResponseNorm(registry); + ops::RegisterLpNorm(registry); + ops::RegisterLSTMNonlinear(registry); + ops::RegisterMatMul(registry); + ops::RegisterMVNorm(registry); + ops::RegisterOneHot(registry); + ops::RegisterPad(registry); + ops::RegisterPadContext(registry); + ops::RegisterPNorm(registry); + ops::RegisterPooling(registry); + ops::RegisterReduce(registry); + ops::RegisterReplaceIndex(registry); + ops::RegisterPriorBox(registry); + ops::RegisterReshape(registry); + ops::RegisterResizeBicubic(registry); + ops::RegisterResizeBilinear(registry); + ops::RegisterResizeNearestNeighbor(registry); + ops::RegisterReverse(registry); + ops::RegisterScalarMath(registry); + ops::RegisterSelect(registry); + ops::RegisterShape(registry); + ops::RegisterSlice(registry); + ops::RegisterSoftmax(registry); + ops::RegisterSpaceToBatchND(registry); + ops::RegisterSpaceToDepth(registry); + ops::RegisterSplice(registry); + ops::RegisterSplit(registry); + ops::RegisterStack(registry); + ops::RegisterStridedSlice(registry); + ops::RegisterSqrDiffMean(registry); + ops::RegisterSqueeze(registry); + ops::RegisterSubsample(registry); + ops::RegisterSumGroup(registry); + ops::RegisterTargetRMSNorm(registry); + ops::RegisterTile(registry); + ops::RegisterTranspose(registry); + ops::RegisterUnstack(registry); + ops::RegisterUnsqueeze(registry); #ifdef MACE_ENABLE_QUANTIZE - ops::RegisterDequantize(this); - ops::RegisterQuantize(this); + ops::RegisterDequantize(registry); + ops::RegisterQuantize(registry); #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL - ops::RegisterBufferTransform(this); - ops::RegisterLSTMCell(this); + ops::RegisterBufferTransform(registry); + ops::RegisterLSTMCell(registry); #endif // 
MACE_ENABLE_OPENCL } +} // namespace ops } // namespace mace diff --git a/mace/ops/registry/ops_registry.h b/mace/ops/registry/registry.h similarity index 68% rename from mace/ops/registry/ops_registry.h rename to mace/ops/registry/registry.h index 01f013dc4c7d334be68b6d42a6a7abaa0c41e7a0..ed8d55f42297ed064450e393c68c8a43ce9f4dd7 100644 --- a/mace/ops/registry/ops_registry.h +++ b/mace/ops/registry/registry.h @@ -12,19 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_REGISTRY_OPS_REGISTRY_H_ -#define MACE_OPS_REGISTRY_OPS_REGISTRY_H_ - -#include "mace/core/operator.h" +#ifndef MACE_OPS_REGISTRY_REGISTRY_H_ +#define MACE_OPS_REGISTRY_REGISTRY_H_ namespace mace { +class OpRegistry; +class OpDelegatorRegistry; + +namespace ops { -class OpRegistry : public OpRegistryBase { - public: - OpRegistry(); - ~OpRegistry() = default; -}; +void RegisterAllOps(OpRegistry *registry); +void RegisterAllOpDelegators(OpDelegatorRegistry *registry); +} // namespace ops } // namespace mace -#endif // MACE_OPS_REGISTRY_OPS_REGISTRY_H_ +#endif // MACE_OPS_REGISTRY_REGISTRY_H_ diff --git a/mace/ops/replace_index.cc b/mace/ops/replace_index.cc index d4f95323f84b70815ed7850c8593cd8d7f40c4a3..8b2f76db8ad9b133530e010935343f7eadbc7bad 100644 --- a/mace/ops/replace_index.cc +++ b/mace/ops/replace_index.cc @@ -20,7 +20,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -94,7 +95,7 @@ class ReplaceIndexOp : public Operation { std::vector forward_indexes_; }; -void RegisterReplaceIndex(OpRegistryBase *op_registry) { +void RegisterReplaceIndex(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "ReplaceIndex", ReplaceIndexOp, DeviceType::CPU, float); } diff --git a/mace/ops/reshape.cc b/mace/ops/reshape.cc index b5daa4301f5ceb036939d04da3767bb685ad9566..63c91c2e3ad0d4035844b4d18ea75f2e3285579d 100644 --- a/mace/ops/reshape.cc +++ b/mace/ops/reshape.cc @@ -14,7 +14,8 @@ #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/utils/math.h" #ifdef MACE_ENABLE_OPENCL @@ -149,7 +150,7 @@ class ReshapeOp : public Operation { }; #endif -void RegisterReshape(OpRegistryBase *op_registry) { +void RegisterReshape(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, DeviceType::CPU, int32_t); MACE_REGISTER_GPU_OP(op_registry, "Reshape", ReshapeOp); diff --git a/mace/ops/resize_bicubic.cc b/mace/ops/resize_bicubic.cc index 5e48ad392e9c46269187b632f5d19c1c058ef081..d5d25eda194c373e6271de01c54db796f18a833e 100644 --- a/mace/ops/resize_bicubic.cc +++ b/mace/ops/resize_bicubic.cc @@ -17,7 +17,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/ops/common/utils.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/resize_bicubic.h" @@ -232,7 +233,7 @@ class ResizeBicubicOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterResizeBicubic(OpRegistryBase *op_registry) { +void RegisterResizeBicubic(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp, DeviceType::CPU, float); diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index 
e209864f15f1d18da6e6f96353f68e257252812e..2fa891d1bb39016a5da3aff565d27ab78296c357 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -16,7 +16,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/utils/memory.h" #include "mace/core/quantize.h" #include "mace/ops/common/utils.h" @@ -366,7 +367,7 @@ class ResizeBilinearOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterResizeBilinear(OpRegistryBase *op_registry) { +void RegisterResizeBilinear(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, DeviceType::CPU, float); diff --git a/mace/ops/resize_nearest_neighbor.cc b/mace/ops/resize_nearest_neighbor.cc index 6ac6b9e7157684805a7faf5a45ce9be169ba2af3..201c4515878ec4872e45e8fb7cc6fb23b53cd43d 100644 --- a/mace/ops/resize_nearest_neighbor.cc +++ b/mace/ops/resize_nearest_neighbor.cc @@ -16,7 +16,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/ops/common/utils.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/resize_nearest_neighbor.h" @@ -172,7 +173,7 @@ class ResizeNearestNeighborOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterResizeNearestNeighbor(OpRegistryBase *op_registry) { +void RegisterResizeNearestNeighbor(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor", ResizeNearestNeighborOp, DeviceType::CPU, float); diff --git a/mace/ops/reverse.cc b/mace/ops/reverse.cc index df3fe6f09ceb2e522c1ec330ba1736076e9e92d6..af9670e34563ab506c15e4c2317091d9ad864e91 100644 --- a/mace/ops/reverse.cc +++ b/mace/ops/reverse.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -72,7 +73,7 @@ class ReverseOp : public Operation { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -void RegisterReverse(OpRegistryBase *op_registry) { +void RegisterReverse(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Reverse", ReverseOp, DeviceType::CPU, float); } diff --git a/mace/ops/scalar_math.cc b/mace/ops/scalar_math.cc index 07794065dbf678ccce6fe1c808240ce6508a4df7..1c2734205c0898e5216adeb0c7370ab73f773588 100644 --- a/mace/ops/scalar_math.cc +++ b/mace/ops/scalar_math.cc @@ -16,7 +16,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/ops/eltwise.h" namespace mace { @@ -154,7 +155,7 @@ class ScalarMathOp : public Operation { int32_t scalar_input_index_; }; -void RegisterScalarMath(OpRegistryBase *op_registry) { +void RegisterScalarMath(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, diff --git a/mace/ops/select.cc b/mace/ops/select.cc index 4d094e651eea8e0113786ee078d4a3c04c8660e0..5001ba20140fa1634af972dc960776f979ea0753 100644 --- a/mace/ops/select.cc +++ b/mace/ops/select.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/core/tensor.h" namespace mace { @@ -204,7 +205,7 @@ class SelectOp : public Operation { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -void RegisterSelect(OpRegistryBase *op_registry) { +void RegisterSelect(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Select", SelectOp, DeviceType::CPU, float); } diff --git a/mace/ops/shape.cc b/mace/ops/shape.cc index dcca202f3229f616a3ce89dddcd008cf998a1a69..0071ec258cb260145625505a5a835011e1e65461 100644 --- a/mace/ops/shape.cc +++ b/mace/ops/shape.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -58,7 +59,7 @@ class ShapeOp : public Operation { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -void RegisterShape(OpRegistryBase *op_registry) { +void RegisterShape(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Shape", ShapeOp, DeviceType::CPU, float); } diff --git a/mace/ops/slice.cc b/mace/ops/slice.cc index ac7ca64a9a700412a19a9600afaccdc2e56d81a8..14e71cad6ceb951f0cc6c6d3ba95ef81dd0fcea2 100644 --- a/mace/ops/slice.cc +++ b/mace/ops/slice.cc @@ -15,7 +15,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -83,7 +84,7 @@ class SliceOp : public Operation { std::vector ends_; }; -void RegisterSlice(OpRegistryBase *op_registry) { +void RegisterSlice(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Slice", SliceOp, DeviceType::CPU, float); } diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index cfbde681eaac06aff6a5d84a8d5cc7afc45021b0..bf7cf202c8cffe528bcae1a9064cca8e0d4d967b 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -18,7 +18,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #ifdef MACE_ENABLE_QUANTIZE #include "mace/ops/fixpoint.h" @@ -520,7 +521,7 @@ class SoftmaxOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterSoftmax(OpRegistryBase *op_registry) { +void RegisterSoftmax(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, DeviceType::CPU, float); diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc index 156c2132289a487cb0db14d0bce05da85a31442d..e5d7ec5ca8ff5d33c215e913b4af4bd96b45cc71 100644 --- a/mace/ops/space_to_batch.cc +++ b/mace/ops/space_to_batch.cc @@ -15,7 +15,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/space_to_batch.h" #endif // MACE_ENABLE_OPENCL @@ -328,7 +329,7 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase { }; #endif // MACE_ENABLE_OPENCL -void RegisterSpaceToBatchND(OpRegistryBase *op_registry) { +void RegisterSpaceToBatchND(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "SpaceToBatchND", SpaceToBatchNDOp, DeviceType::CPU, float); diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc index 59c1a342162d0637f8e2d30b33c9b1835fac61f5..3653d09a9454057f2d2143774f4fa97ecc13167d 100644 --- a/mace/ops/space_to_depth.cc +++ b/mace/ops/space_to_depth.cc @@ -15,7 +15,8 @@ #include #include -#include 
"mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/space_to_depth.h" #endif // MACE_ENABLE_OPENCL @@ -180,7 +181,7 @@ class SpaceToDepthOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterSpaceToDepth(OpRegistryBase *op_registry) { +void RegisterSpaceToDepth(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "SpaceToDepth", SpaceToDepthOp, DeviceType::CPU, float); diff --git a/mace/ops/splice.cc b/mace/ops/splice.cc index 8f9198c00079f1c364bbc49b7b7c011cd384dd3d..af1536717ae66c3a1223c5bb7b4f346c7821cfd6 100644 --- a/mace/ops/splice.cc +++ b/mace/ops/splice.cc @@ -29,7 +29,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/utils/math.h" namespace mace { @@ -153,7 +154,7 @@ class SpliceOp : public Operation { std::vector forward_const_indexes_; }; -void RegisterSplice(OpRegistryBase *op_registry) { +void RegisterSplice(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Splice", SpliceOp, DeviceType::CPU, float); } diff --git a/mace/ops/split.cc b/mace/ops/split.cc index ffe7172f841bb76be8e4428cdf9a30ac29ee27bd..bb86aecbfc872e1d439b2aaa07bbbe93da81af7e 100644 --- a/mace/ops/split.cc +++ b/mace/ops/split.cc @@ -15,7 +15,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/split.h" #endif // MACE_ENABLE_OPENCL @@ -128,7 +129,7 @@ class SplitOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterSplit(OpRegistryBase *op_registry) { +void RegisterSplit(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Split", SplitOp, DeviceType::CPU, float); diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc index 2d85ed98448ba37e60572df7f87c6184ebbeddfb..0e2b8d2bb891eceb5c46836af0e2e9b0bb81af15 100644 --- a/mace/ops/sqrdiff_mean.cc +++ b/mace/ops/sqrdiff_mean.cc @@ -15,7 +15,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/sqrdiff_mean.h" #endif // MACE_ENABLE_OPENCL @@ -100,7 +101,7 @@ class SqrDiffMeanOp : public Operation { }; #endif // MACE_ENABLE_OPENCL -void RegisterSqrDiffMean(OpRegistryBase *op_registry) { +void RegisterSqrDiffMean(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, DeviceType::CPU, float); diff --git a/mace/ops/squeeze.cc b/mace/ops/squeeze.cc index 0c08cfd589b6d5c5f080432bffb62162706f15bc..590479dd327f382286632bd27458135281e6aec7 100644 --- a/mace/ops/squeeze.cc +++ b/mace/ops/squeeze.cc @@ -15,7 +15,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -77,7 +78,7 @@ class SqueezeOp : public SqueezeOpRaw { } }; -void RegisterSqueeze(OpRegistryBase *op_registry) { +void RegisterSqueeze(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, float); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t); diff --git a/mace/ops/stack.cc b/mace/ops/stack.cc index f49c401aebd19af8ca99681e710d8fa704dbf804..87cc51a0c0e89d9d8a6c48d715ce10d32a08061c 100644 --- a/mace/ops/stack.cc +++ 
b/mace/ops/stack.cc @@ -15,7 +15,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -77,7 +78,7 @@ class StackOp : public Operation { int axis_; }; -void RegisterStack(OpRegistryBase *op_registry) { +void RegisterStack(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, int32_t); } diff --git a/mace/ops/strided_slice.cc b/mace/ops/strided_slice.cc index 4218d1f78614b487c85d4d645a09495b9c380a6b..bf44d5a162b19b1d813acc5c39ad9a1077622887 100644 --- a/mace/ops/strided_slice.cc +++ b/mace/ops/strided_slice.cc @@ -16,7 +16,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/utils/math.h" namespace mace { @@ -350,7 +351,7 @@ class StridedSliceOp : public Operation { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -void RegisterStridedSlice(OpRegistryBase *op_registry) { +void RegisterStridedSlice(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, diff --git a/mace/ops/subsample.cc b/mace/ops/subsample.cc index 11835ac9987df4499d1686d0b03d547a3cbfd336..e3c2977e2e8b7f091c983d510faf1d51dea73a71 100644 --- a/mace/ops/subsample.cc +++ b/mace/ops/subsample.cc @@ -18,7 +18,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/utils/math.h" namespace mace { @@ -100,7 +101,7 @@ class SubsampleOp : public Operation { std::vector forward_indexes_; }; -void RegisterSubsample(OpRegistryBase *op_registry) { +void RegisterSubsample(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Subsample", SubsampleOp, DeviceType::CPU, float); } diff --git a/mace/ops/sum_group.cc b/mace/ops/sum_group.cc index 1b62af7e7809c70b1931844e8b606fc322a4a83e..b8524a7480f3c5095e5bbf6d50ec92f3c26240ea 100644 --- a/mace/ops/sum_group.cc +++ b/mace/ops/sum_group.cc @@ -20,7 +20,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -101,7 +102,7 @@ class SumGroupOp : public Operation { } }; -void RegisterSumGroup(OpRegistryBase *op_registry) { +void RegisterSumGroup(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "SumGroup", SumGroupOp, DeviceType::CPU, float); } diff --git a/mace/ops/target_rms_norm.cc b/mace/ops/target_rms_norm.cc index 23535e15804b476b4b979810f8a3f7663b96b266..e2b2fa2eb72177ae153c1b70f27fb333ebaee1af 100644 --- a/mace/ops/target_rms_norm.cc +++ b/mace/ops/target_rms_norm.cc @@ -22,7 +22,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -148,7 +149,7 @@ class TargetRMSNormOp : public Operation { int block_dim_; }; -void RegisterTargetRMSNorm(OpRegistryBase *op_registry) { +void RegisterTargetRMSNorm(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "TargetRMSNorm", TargetRMSNormOp, DeviceType::CPU, float); } diff --git a/mace/ops/tile.cc b/mace/ops/tile.cc index 36d0bfe9b826b51763bc62fb0758d1fa7e665f11..c09ca92104706649c525dc4a0bba258d5dbc1f0c 100644 --- a/mace/ops/tile.cc +++ b/mace/ops/tile.cc @@ -16,7 +16,8 @@ 
#include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/utils/memory.h" namespace mace { @@ -110,7 +111,7 @@ class TileOp : public Operation { int has_data_format_; }; -void RegisterTile(OpRegistryBase *op_registry) { +void RegisterTile(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Tile", TileOp, DeviceType::CPU, float); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("Tile").SetDevicePlacerFunc( diff --git a/mace/ops/transpose.cc b/mace/ops/transpose.cc index 4eb41e5b7b4a902d6cf930cec2e39b7616853f4c..a366f3d421cec6dbd7172dc25b18bd660165cb12 100644 --- a/mace/ops/transpose.cc +++ b/mace/ops/transpose.cc @@ -20,7 +20,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" #include "mace/ops/common/transpose.h" namespace mace { @@ -64,7 +65,7 @@ class TransposeOp : public Operation { std::vector dims_; }; -void RegisterTranspose(OpRegistryBase *op_registry) { +void RegisterTranspose(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Transpose", TransposeOp, DeviceType::CPU, float); } diff --git a/mace/ops/unsqueeze.cc b/mace/ops/unsqueeze.cc index 9fde2a91b946a0fbe8db29307615cfa0c735f189..cc28c14d8865f4bdcac79f6c5b8974f5530fba52 100644 --- a/mace/ops/unsqueeze.cc +++ b/mace/ops/unsqueeze.cc @@ -15,7 +15,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -59,7 +60,7 @@ class UnsqueezeOp : public Operation { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -void RegisterUnsqueeze(OpRegistryBase *op_registry) { +void RegisterUnsqueeze(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Unsqueeze", UnsqueezeOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "Unsqueeze", UnsqueezeOp, diff --git a/mace/ops/unstack.cc b/mace/ops/unstack.cc index b2a6eb6cee3b1adff4ecd7a40c3dcabb583e86ba..d0928614293dee689c77b607c57469c933c32b0a 100644 --- a/mace/ops/unstack.cc +++ b/mace/ops/unstack.cc @@ -15,7 +15,8 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" namespace mace { namespace ops { @@ -73,7 +74,7 @@ class UnstackOp : public Operation { int axis_; }; -void RegisterUnstack(OpRegistryBase *op_registry) { +void RegisterUnstack(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Unstack", UnstackOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "Unstack", UnstackOp, diff --git a/test/ccbenchmark/mace/ops/depthwise_deconv2d_benchmark.cc b/test/ccbenchmark/mace/ops/depthwise_deconv2d_benchmark.cc index 2ac04e0c0a398e68e1b4cd4bab8b9b78db7a48ae..fc0e7ed71dfceec442360df57d45a8447ea2deb6 100644 --- a/test/ccbenchmark/mace/ops/depthwise_deconv2d_benchmark.cc +++ b/test/ccbenchmark/mace/ops/depthwise_deconv2d_benchmark.cc @@ -15,7 +15,7 @@ #include #include "mace/utils/statistics.h" -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" #include "mace/benchmark_utils/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/test/ccbenchmark/mace/ops/quantize_benchmark.cc b/test/ccbenchmark/mace/ops/quantize_benchmark.cc index 0923a29310b4483ee9abcd249194b1782213c37a..c43bcacb86489c34af42f505e0b3c2a89511395a 100644 --- a/test/ccbenchmark/mace/ops/quantize_benchmark.cc +++ b/test/ccbenchmark/mace/ops/quantize_benchmark.cc @@ -14,7 +14,7 @@ #ifdef MACE_ENABLE_QUANTIZE 
-#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" #include "mace/benchmark_utils/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/test/ccbenchmark/mace/ops/sqrdiff_mean_benchmark.cc b/test/ccbenchmark/mace/ops/sqrdiff_mean_benchmark.cc index 05eaf21d11c3fb5dd36173f73f9ba1d70a892c62..791182e82eec7b8d9a3a2ceae9496809e872e252 100644 --- a/test/ccbenchmark/mace/ops/sqrdiff_mean_benchmark.cc +++ b/test/ccbenchmark/mace/ops/sqrdiff_mean_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" #include "mace/benchmark_utils/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/test/ccunit/mace/ops/arm/fp32/gemm_test.cc b/test/ccunit/mace/ops/arm/fp32/gemm_test.cc index 805720331b193895301b40b408b4eac0b384104c..65a516f966326661da8f214de5803fe32e2402b0 100644 --- a/test/ccunit/mace/ops/arm/fp32/gemm_test.cc +++ b/test/ccunit/mace/ops/arm/fp32/gemm_test.cc @@ -15,8 +15,8 @@ #include +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/core/op_context.h" #include "mace/ops/arm/fp32/gemm.h" #include "mace/ops/ref/gemm.h" #include "mace/ops/testing/test_utils.h" @@ -50,7 +50,7 @@ void TestGemmFloat32(const index_t batch, GenerateRandomRealTypeData(rhs.shape(), rhs_data); GenerateRandomRealTypeData(output.shape(), output_data); } - ::mace::ops::arm::fp32::Gemm gemm; + ::mace::ops::arm::fp32::Gemm gemm((delegator::GemmParam())); utils::ThreadPool thread_pool(1, AFFINITY_NONE); thread_pool.Init(); CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); @@ -71,7 +71,7 @@ void TestGemmFloat32(const index_t batch, Tensor expected_output(GetCPUAllocator(), DataType::DT_FLOAT); expected_output.Resize({batch, rows, cols}); - ::mace::ops::ref::Gemm gemm_ref; + ::mace::ops::ref::Gemm gemm_ref((delegator::GemmParam())); gemm_ref.Compute(nullptr, &lhs, &rhs, diff --git a/test/ccunit/mace/ops/arm/fp32/gemv_test.cc b/test/ccunit/mace/ops/arm/fp32/gemv_test.cc index bc97bc3ee8ed9c52f62518830cba2b8775973702..3a224ea261c3782ec37336f309fddd9ef539f110 100644 --- a/test/ccunit/mace/ops/arm/fp32/gemv_test.cc +++ b/test/ccunit/mace/ops/arm/fp32/gemv_test.cc @@ -15,8 +15,8 @@ #include +#include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/core/op_context.h" #include "mace/ops/arm/fp32/gemv.h" #include "mace/ops/ref/gemv.h" #include "mace/ops/testing/test_utils.h" @@ -53,7 +53,8 @@ void TestGemvFloat32(const index_t batch, thread_pool.Init(); CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); OpContext context(nullptr, &cpu_device); - ::mace::ops::arm::fp32::Gemv gemv; + ::mace::ops::arm::fp32::Gemv gemv = + ::mace::ops::arm::fp32::Gemv(DelegatorParam()); gemv.Compute(&context, &lhs, &rhs, @@ -67,7 +68,8 @@ void TestGemvFloat32(const index_t batch, Tensor expected_output(GetCPUAllocator(), DataType::DT_FLOAT); expected_output.Resize({batch, height}); - ::mace::ops::ref::Gemv gemv_ref; + ::mace::ops::ref::Gemv gemv_ref = + ::mace::ops::ref::Gemv(DelegatorParam()); gemv_ref.Compute(nullptr, &lhs, &rhs, diff --git a/test/ccunit/mace/ops/arm/q8/gemv_test.cc b/test/ccunit/mace/ops/arm/q8/gemv_test.cc index 6216cabaed02bbfc84ebc4b10adc0a012cdece3e..619d343fdd4ccf9ea051b22d0004cb3edc1a5352 100644 --- a/test/ccunit/mace/ops/arm/q8/gemv_test.cc +++ b/test/ccunit/mace/ops/arm/q8/gemv_test.cc @@ -15,8 +15,8 @@ #include +#include "mace/core/ops/op_context.h" #include 
"mace/core/tensor.h" -#include "mace/core/op_context.h" #include "mace/ops/arm/q8/gemv.h" #include "mace/ops/ref/gemv.h" #include "mace/ops/testing/test_utils.h" @@ -58,7 +58,8 @@ void TestGemvInt32(const index_t batch, thread_pool.Init(); CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); OpContext context(nullptr, &cpu_device); - mace::ops::arm::q8::Gemv gemv; + mace::ops::arm::q8::Gemv gemv = + mace::ops::arm::q8::Gemv(DelegatorParam()); gemv.Compute(&context, &lhs, &rhs, @@ -72,7 +73,8 @@ void TestGemvInt32(const index_t batch, Tensor expected_output(GetCPUAllocator(), DataType::DT_INT32); expected_output.Resize({batch, height}); - mace::ops::ref::Gemv gemv_ref; + mace::ops::ref::Gemv gemv_ref = + mace::ops::ref::Gemv(DelegatorParam()); gemv_ref.Compute(nullptr, &lhs, &rhs, @@ -130,7 +132,8 @@ void TestGemvUint8(const index_t batch, thread_pool.Init(); CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); OpContext context(nullptr, &cpu_device); - mace::ops::arm::q8::Gemv gemv; + mace::ops::arm::q8::Gemv gemv = + mace::ops::arm::q8::Gemv(DelegatorParam()); gemv.Compute(&context, &lhs, &rhs, @@ -146,7 +149,8 @@ void TestGemvUint8(const index_t batch, expected_output.SetScale(0.6); expected_output.SetZeroPoint(57); expected_output.Resize({batch, height}); - mace::ops::ref::Gemv gemv_ref; + mace::ops::ref::Gemv gemv_ref = + mace::ops::ref::Gemv(DelegatorParam()); gemv_ref.Compute(nullptr, &lhs, &rhs, diff --git a/test/ccunit/mace/ops/matmul_test.cc b/test/ccunit/mace/ops/matmul_test.cc index d0432bb0b958ae6ee452b976b5c403e4bb4c04ba..9d46f0e1d97391e6dbf539f0cbee21b29918a1fc 100644 --- a/test/ccunit/mace/ops/matmul_test.cc +++ b/test/ccunit/mace/ops/matmul_test.cc @@ -14,6 +14,7 @@ #include +#include "mace/ops/delegator/gemm.h" #include "mace/ops/ops_test_util.h" #include "mace/ops/ref/gemm.h" @@ -111,7 +112,7 @@ void Complex(const std::vector &batch, .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - ref::Gemm gemm; + ref::Gemm gemm = ref::Gemm(delegator::GemmParam()); Tensor expected_output_tensor; std::vector expected_output_shape({rows, cols}); expected_output_shape.insert(expected_output_shape.begin(), diff --git a/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc b/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc index 3dfe468a8db889418c48a15776e79adccadf9319..808ea9aa7f89905a890dbb67493a1f9c3922269c 100644 --- a/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc +++ b/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc @@ -16,7 +16,7 @@ #include #include "gtest/gtest.h" -#include "mace/core/op_context.h" +#include "mace/core/ops/op_context.h" #include "mace/core/runtime/opencl/gpu_device.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/tensor.h" @@ -134,7 +134,7 @@ TEST(OutOfRangeCheckTest, RandomTest) { std::unique_ptr device = make_unique( gpu_context.opencl_tuner()); - Workspace ws; + Workspace ws(nullptr); OpContext context(&ws, device.get()); std::vector buffer_shape = {batch, height, width, channels}; diff --git a/test/ccunit/mace/ops/sqrdiff_mean_test.cc b/test/ccunit/mace/ops/sqrdiff_mean_test.cc index 3257987c7b9d8dc65a218059cd5c44ae9ab2e55d..42375b7df4e32a0ab55ce06730db5f1bf9280a03 100644 --- a/test/ccunit/mace/ops/sqrdiff_mean_test.cc +++ b/test/ccunit/mace/ops/sqrdiff_mean_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/ops/operator.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/test/ccutils/mace/ops/ops_test_util.h b/test/ccutils/mace/ops/ops_test_util.h index a9f8a9842890c4d9737040c89bd8d68c4fb5d7a1..e1e563426ded80603dff60b6415a9eb81ad62fcb 100644 --- a/test/ccutils/mace/ops/ops_test_util.h +++ b/test/ccutils/mace/ops/ops_test_util.h @@ -31,7 +31,9 @@ #include "mace/core/device_context.h" #include "mace/core/tensor.h" #include "mace/core/workspace.h" -#include "mace/ops/registry/ops_registry.h" +#include "mace/core/registry/ops_registry.h" +#include "mace/core/registry/op_delegator_registry.h" +#include "mace/ops/registry/registry.h" #include "mace/public/mace.h" #include "mace/utils/memory.h" #include "mace/utils/math.h" @@ -109,7 +111,12 @@ class OpTestContext { class OpsTestNet { public: OpsTestNet() : - op_registry_(make_unique()) {} + op_registry_(make_unique()), + op_delegator_registry_(make_unique()), + ws_(op_delegator_registry_.get()) { + ops::RegisterAllOps(op_registry_.get()); + ops::RegisterAllOpDelegators(op_delegator_registry_.get()); + } template void AddInputFromArray(const std::string &name, @@ -426,7 +433,8 @@ class OpsTestNet { void Sync(); public: - std::shared_ptr op_registry_; + std::unique_ptr op_registry_; + std::unique_ptr op_delegator_registry_; Workspace ws_; std::vector op_defs_; std::unique_ptr net_;