Commit 2895c3ee authored by luxuhui

refactor: refactor op base module and op delegator mechanism

N/A
Signed-off-by: Luxuhui <luxuhui@xiaomi.com>
Parent 28954099
@@ -19,7 +19,7 @@ Define the new Op class in `mace/ops/my_custom_op.cc`.
 The structure of Op is like the following code.
 ```c++
-#include "mace/core/operator.h"
+#include "mace/core/ops/operator.h"
 namespace mace {
 namespace ops {
@@ -39,7 +39,7 @@ class MyCustomOp<DeviceType::GPU, float> : public Operation {
 };
 #endif  // MACE_ENABLE_OPENCL
-void RegisterMyCustomOp(OpRegistryBase *op_registry) {
+void RegisterMyCustomOp(OpRegistry *op_registry) {
   MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
                    DeviceType::CPU, float);
@@ -63,14 +63,14 @@ namespace ops {
 ...
-extern void RegisterMyCustomOp(OpRegistryBase *op_registry);
+extern void RegisterMyCustomOp(OpRegistry *op_registry);
 ...
 }  // namespace ops
-OpRegistry::OpRegistry() : OpRegistryBase() {
+OpRegistry::OpRegistry() {
   // Keep in lexicographical order
 ...
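For context, a minimal sketch of what the documented `MyCustomOp` skeleton looks like as a whole under the refactored headers; the CPU specialization and the empty `Run` body are illustrative, not part of this patch:

```c++
#include "mace/core/ops/operator.h"

namespace mace {
namespace ops {

template <DeviceType D, class T>
class MyCustomOp;

template <>
class MyCustomOp<DeviceType::CPU, float> : public Operation {
 public:
  explicit MyCustomOp(OpConstructContext *context)
      : Operation(context) {}

  MaceStatus Run(OpContext *context) override {
    // Kernel implementation goes here.
    return MaceStatus::MACE_SUCCESS;
  }
};

}  // namespace ops
}  // namespace mace
```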
@@ -557,7 +557,7 @@ which will reduce the library size significantly. the final binary just link the
 }  // namespace ops
-OpRegistry::OpRegistry() : OpRegistryBase() {
+OpRegistry::OpRegistry() {
   // Just leave the ops used in your models
 ...
@@ -370,12 +370,13 @@ the sample code show how to calculate the Top-1 accuracy with imagenet validation
 Reduce Library Size
 -------------------
-Remove the registration of the ops unused for your models in the ``mace/ops/ops_register.cc``,
-which will reduce the library size significantly. the final binary just link the registered ops' code.
+Remove the registration of the ops and delegators unused for your models in the
+``mace/ops/registry/ops_registry.cc`` and ``mace/ops/registry/op_delegators_registry.cc``,
+which will reduce the library size significantly. The final binary links only the registered ops' and delegators' code.
 .. code-block:: cpp
-   #include "mace/ops/ops_register.h"
+   #include "mace/ops/registry/registry.h"
    namespace mace {
    namespace ops {
@@ -386,12 +387,38 @@ which will reduce the library size significantly. the final binary just link the
    }  // namespace ops
-   OpRegistry::OpRegistry() : OpRegistryBase() {
+   void RegisterAllOps(OpRegistry *registry) {
      // Just leave the ops used in your models
      ...
-     ops::RegisterMyCustomOp(this);
+     ops::RegisterMyCustomOp(registry);
+     ...
+   }
+   }  // namespace mace
+.. code-block:: cpp
+   #include "mace/ops/registry/registry.h"
+   namespace mace {
+   namespace ops {
+   // Just leave the delegators used in your ops
+   ...
+   }  // namespace ops
+   void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
+     // Just leave the delegators used in your ops
+     ...
+     ops::RegisterMyCustomDelegator(registry);
 ...
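A registration function like `ops::RegisterMyCustomDelegator` above would typically be generated by the `MACE_REGISTER_DELEGATOR` macro introduced by this commit in `mace/core/registry/op_delegator_registry.h`. A hedged sketch, where `MyCustom` and `MyCustomParam` are hypothetical names and the `CPU`/`float` key arguments follow the macro's stringization pattern:

```c++
#include "mace/core/ops/op_delegator.h"
#include "mace/core/registry/op_delegator_registry.h"

namespace mace {
namespace ops {

// Hypothetical delegator and param type, for illustration only.
struct MyCustomParam : public DelegatorParam {};

class MyCustom : public OpDelegator {
 public:
  explicit MyCustom(const MyCustomParam &param) : OpDelegator(param) {}
};

// Expands to:
//   void RegisterMyCustomDelegator(OpDelegatorRegistry *registry) { ... }
// MACE_CPU_IMPL_TYPE resolves to NEON or REF at compile time.
MACE_REGISTER_DELEGATOR(
    registry, MyCustom, MyCustomParam,
    MACE_DELEGATOR_KEY(MyCustom, CPU, float, MACE_CPU_IMPL_TYPE))

}  // namespace ops
}  // namespace mace
```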
@@ -26,6 +26,8 @@ cc_library(
     srcs = glob(
         [
             "*.cc",
+            "ops/*.cc",
+            "registry/*.cc",
             "runtime/cpu/*.cc",
         ],
         exclude = [
@@ -53,6 +55,8 @@ cc_library(
     hdrs = glob(
         [
             "*.h",
+            "ops/*.h",
+            "registry/*.h",
             "runtime/cpu/*.h",
         ],
         exclude = [
@@ -68,7 +72,7 @@ cc_library(
     ])) + if_hta_enabled(glob([
         "runtime/hexagon/*hta*.h",
     ])) + if_apu_enabled(glob([
-        "runtime/apu/*.h"
+        "runtime/apu/*.h",
     ])) + if_rpcmem_enabled([
         "rpcmem.h",
     ]),
@@ -8,9 +8,16 @@ set(CORE_SRCS
   net.cc
   net_def_adapter.cc
   net_optimizer.cc
-  op_context.cc
-  operator.cc
+  ops/op_condition_builder.cc
+  ops/op_condition_context.cc
+  ops/op_construct_context.cc
+  ops/op_context.cc
+  ops/operator.cc
+  ops/op_init_context.cc
   quantize.cc
+  registry/op_delegator_registry.cc
+  registry/op_registration_info.cc
+  registry/ops_registry.cc
   runtime_failure_mock.cc
   types.cc
   workspace.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "mace/core/net.h"
 #include <algorithm>
 #include <limits>
 #include <set>
@@ -20,8 +22,9 @@
 #include "mace/core/future.h"
 #include "mace/core/memory_optimizer.h"
-#include "mace/core/net.h"
-#include "mace/core/op_context.h"
+#include "mace/core/ops/op_init_context.h"
+#include "mace/core/ops/op_context.h"
+#include "mace/core/registry/ops_registry.h"
 #include "mace/public/mace.h"
 #include "mace/port/env.h"
 #include "mace/utils/conf_util.h"
@@ -33,7 +36,7 @@
 namespace mace {
-SerialNet::SerialNet(const OpRegistryBase *op_registry,
+SerialNet::SerialNet(const OpRegistry *op_registry,
                      const NetDef *net_def,
                      Workspace *ws,
                      Device *target_device,
@@ -21,13 +21,14 @@
 #include <unordered_map>
 #include <sstream>
-#include "mace/core/operator.h"
+#include "mace/core/ops/operator.h"
 namespace mace {
 class RunMetadata;
 class Workspace;
 class MemoryOptimizer;
+class OpRegistry;
 class NetBase {
  public:
@@ -44,7 +45,7 @@ class NetBase {
 class SerialNet : public NetBase {
  public:
-  SerialNet(const OpRegistryBase *op_registry,
+  SerialNet(const OpRegistry *op_registry,
             const NetDef *net_def,
             Workspace *ws,
             Device *target_device,
@@ -17,7 +17,9 @@
 #include <string>
 #include <vector>
-#include "mace/core/operator.h"
+#include "mace/core/ops/operator.h"
+#include "mace/core/ops/op_condition_context.h"
+#include "mace/core/registry/ops_registry.h"
 #include "mace/utils/math.h"
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/opencl_util.h"
@@ -82,7 +84,7 @@ void BuildTransposeOpDef(
 }  // namespace
-NetDefAdapter::NetDefAdapter(const OpRegistryBase *op_registry,
+NetDefAdapter::NetDefAdapter(const OpRegistry *op_registry,
                              const Workspace *ws)
     : op_registry_(op_registry), ws_(ws) {}
@@ -23,14 +23,17 @@
 #include "mace/core/types.h"
 #include "mace/proto/mace.pb.h"
 #include "mace/port/port.h"
-#include "mace/core/operator.h"
+#include "mace/core/ops/operator.h"
 #include "mace/core/net_optimizer.h"
 namespace mace {
-class OpRegistryBase;
-class Workspace;
 class Device;
+class OpConditionContext;
+class OperatorDef;
+class OpRegistry;
+class Workspace;
 ///////////////////////////////////////////////////////////////////////////////
 /// Conventions
@@ -49,7 +52,7 @@ class Device;
 ///////////////////////////////////////////////////////////////////////////////
 class NetDefAdapter {
  public:
-  NetDefAdapter(const OpRegistryBase *op_registry,
+  NetDefAdapter(const OpRegistry *op_registry,
                 const Workspace *ws);
   // Adapt original net_def to a better net.
   // 1. Adapt device: choose best device for every op in the net.
@@ -122,7 +125,7 @@ class NetDefAdapter {
   std::string DebugString(const NetDef *net_def);
  private:
-  const OpRegistryBase *op_registry_;
+  const OpRegistry *op_registry_;
   const Workspace *ws_;
   NetOptimizer net_optimizer_;
 };
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_OPERATOR_H_
#define MACE_CORE_OPERATOR_H_
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
#include "mace/core/arg_helper.h"
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
#include "mace/proto/mace.pb.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
// OpConditionContext has all information used for choosing proper Op
class OpConditionContext {
public:
typedef std::unordered_map<std::string, std::vector<index_t>> TensorShapeMap;
OpConditionContext(const Workspace *ws, TensorShapeMap *info);
~OpConditionContext() = default;
void set_operator_def(const OperatorDef *operator_def);
inline const OperatorDef *operator_def() const {
return operator_def_;
}
inline const Workspace *workspace() const {
return ws_;
}
inline void set_device(Device *device) {
device_ = device;
}
inline Device *device() const {
return device_;
}
inline TensorShapeMap *tensor_shape_info() const {
return tensor_shape_info_;
}
void set_output_mem_type(MemoryType type);
inline MemoryType output_mem_type() const {
return output_mem_type_;
}
void SetInputInfo(size_t idx, MemoryType mem_type, DataType dt);
MemoryType GetInputMemType(size_t idx) const;
DataType GetInputDataType(size_t idx) const;
#ifdef MACE_ENABLE_OPENCL
void SetInputOpenCLBufferType(size_t idx, OpenCLBufferType buffer_type);
OpenCLBufferType GetInputOpenCLBufferType(size_t idx) const;
#endif // MACE_ENABLE_OPENCL
private:
const OperatorDef *operator_def_;
const Workspace *ws_;
Device *device_;
TensorShapeMap *tensor_shape_info_;
// used for memory transform
std::vector<MemoryType> input_mem_types_;
std::vector<DataType> input_data_types_;
MemoryType output_mem_type_; // there is only one output memory type now.
#ifdef MACE_ENABLE_OPENCL
std::vector<OpenCLBufferType> input_opencl_buffer_types_;
#endif // MACE_ENABLE_OPENCL
};
// memory_optimizer, device
class OpConstructContext {
typedef std::unordered_map<std::string, std::vector<index_t>> TensorShapeMap;
public:
explicit OpConstructContext(Workspace *ws);
~OpConstructContext() = default;
void set_operator_def(std::shared_ptr<OperatorDef> operator_def);
inline std::shared_ptr<OperatorDef> operator_def() const {
return operator_def_;
}
inline Workspace *workspace() const {
return ws_;
}
inline void set_device(Device *device) {
device_ = device;
}
inline Device *device() const {
return device_;
}
#ifdef MACE_ENABLE_OPENCL
inline MemoryType GetOpMemoryType() const {
return static_cast<MemoryType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def_, OutputMemoryTypeTagName(),
static_cast<int>(MemoryType::CPU_BUFFER)));
}
#endif // MACE_ENABLE_OPENCL
private:
std::shared_ptr<OperatorDef> operator_def_;
Workspace *ws_;
Device *device_;
};
// memory_optimizer, device
class OpInitContext {
public:
explicit OpInitContext(Workspace *ws, Device *device = nullptr);
~OpInitContext() = default;
inline Workspace *workspace() const {
return ws_;
}
inline void set_device(Device *device) {
device_ = device;
}
inline Device *device() const {
return device_;
}
private:
Workspace *ws_;
Device *device_;
};
// Conventions
// * If there exist format, NHWC is the default format
// * The input/output format of CPU ops with float data type is NCHW
// * The input/output format of GPU ops and CPU Quantization ops is NHWC
// * Inputs' data type is same as the operation data type by default.
// * The outputs' data type is same as the operation data type by default.
class Operation {
public:
explicit Operation(OpConstructContext *context);
virtual ~Operation() = default;
template<typename T>
inline T GetOptionalArg(const std::string &name,
const T &default_value) const {
MACE_CHECK(operator_def_, "operator_def was null!");
return ProtoArgHelper::GetOptionalArg<OperatorDef, T>(
*operator_def_, name, default_value);
}
template<typename T>
inline std::vector<T> GetRepeatedArgs(
const std::string &name, const std::vector<T> &default_value = {}) const {
MACE_CHECK(operator_def_, "operator_def was null!");
return ProtoArgHelper::GetRepeatedArgs<OperatorDef, T>(
*operator_def_, name, default_value);
}
inline DeviceType device_type() const {
return static_cast<DeviceType>(operator_def_->device_type());
}
inline const Tensor *Input(unsigned int idx) {
MACE_CHECK(idx < inputs_.size());
return inputs_[idx];
}
inline Tensor *Output(int idx) { return outputs_[idx]; }
inline int InputSize() { return inputs_.size(); }
inline int OutputSize() { return outputs_.size(); }
inline const std::vector<const Tensor *> &Inputs() const { return inputs_; }
inline const std::vector<Tensor *> &Outputs() { return outputs_; }
// Run Op asynchronously (depends on device), return a future if not nullptr.
virtual MaceStatus Init(OpInitContext *);
virtual MaceStatus Run(OpContext *) = 0;
inline const OperatorDef &debug_def() const {
MACE_CHECK(has_debug_def(), "operator_def was null!");
return *operator_def_;
}
inline void set_debug_def(
const std::shared_ptr<OperatorDef> &operator_def) {
operator_def_ = operator_def;
}
inline bool has_debug_def() const { return operator_def_ != nullptr; }
inline std::shared_ptr<OperatorDef> operator_def() {
return operator_def_;
}
protected:
std::shared_ptr<OperatorDef> operator_def_;
std::vector<const Tensor *> inputs_;
std::vector<Tensor *> outputs_;
MACE_DISABLE_COPY_AND_ASSIGN(Operation);
};
// MACE_OP_INPUT_TAGS and MACE_OP_OUTPUT_TAGS are optional features to name the
// indices of the operator's inputs and outputs, in order to avoid confusion.
// For example, for a fully convolution layer that has input, weight and bias,
// you can define its input tags as:
// MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
// And in the code, instead of doing
// auto& weight = Input(1);
// you can now do
// auto& weight = Input(WEIGHT);
// to make it more clear.
#define MACE_OP_INPUT_TAGS(first_input, ...) \
enum _InputTags { first_input = 0, __VA_ARGS__ }
#define MACE_OP_OUTPUT_TAGS(first_input, ...) \
enum _OutputTags { first_input = 0, __VA_ARGS__ }
struct OpRegistrationInfo {
public:
typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)>
OpCreator;
typedef std::function<std::set<DeviceType>(OpConditionContext *)>
DevicePlacer;
typedef std::function<void(OpConditionContext *)> MemoryTypeSetter;
typedef std::function<std::vector<DataFormat>(OpConditionContext *)>
DataFormatSelector;
OpRegistrationInfo();
void AddDevice(DeviceType);
void Register(const std::string &key, OpCreator creator);
std::set<DeviceType> devices;
std::unordered_map<std::string, OpCreator> creators;
DevicePlacer device_placer;
MemoryTypeSetter memory_type_setter;
DataFormatSelector data_format_selector;
};
class OpConditionBuilder {
public:
explicit OpConditionBuilder(const std::string &type);
const std::string type() const;
OpConditionBuilder &SetDevicePlacerFunc(
OpRegistrationInfo::DevicePlacer placer);
// If you set input memory type for specified Op,
// you must call OpConditionContext::set_output_mem_type
OpConditionBuilder &SetInputMemoryTypeSetter(
OpRegistrationInfo::MemoryTypeSetter setter);
OpConditionBuilder &SetInputsDataFormatSelector(
OpRegistrationInfo::DataFormatSelector selector);
void Finalize(OpRegistrationInfo *info) const;
private:
std::string type_;
OpRegistrationInfo::DevicePlacer placer_;
OpRegistrationInfo::MemoryTypeSetter memory_type_setter_;
OpRegistrationInfo::DataFormatSelector data_format_selector_;
};
class OpRegistryBase {
public:
OpRegistryBase() = default;
virtual ~OpRegistryBase() = default;
MaceStatus Register(const std::string &op_type,
const DeviceType device_type,
const DataType dt,
OpRegistrationInfo::OpCreator creator);
MaceStatus Register(const OpConditionBuilder &builder);
const std::set<DeviceType> AvailableDevices(
const std::string &op_type, OpConditionContext *context) const;
void GetInOutMemoryTypes(
const std::string &op_type, OpConditionContext *context) const;
const std::vector<DataFormat> InputsDataFormat(
const std::string &op_type, OpConditionContext *context) const;
std::unique_ptr<Operation> CreateOperation(
OpConstructContext *context,
DeviceType device_type) const;
template<class DerivedType>
static std::unique_ptr<Operation> DefaultCreator(
OpConstructContext *context) {
return std::unique_ptr<Operation>(new DerivedType(context));
}
private:
std::unordered_map<
std::string,
std::unique_ptr<OpRegistrationInfo>> registry_;
MACE_DISABLE_COPY_AND_ASSIGN(OpRegistryBase);
};
#define MACE_REGISTER_OP(op_registry, op_type, class_name, device, dt) \
op_registry->Register(op_type, \
device, \
DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name<device, dt>>)
#define MACE_REGISTER_OP_BY_CLASS( \
op_registry, op_type, class_name, device, dt) \
op_registry->Register(op_type, \
device, \
DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name>)
#ifdef MACE_ENABLE_OPENCL
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \
op_registry->Register( \
op_type, \
DeviceType::GPU, \
DT_FLOAT, \
OpRegistryBase::DefaultCreator<class_name<DeviceType::GPU, float>>)
#else
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name)
#endif
#define MACE_REGISTER_OP_CONDITION(op_registry, builder) \
op_registry->Register(builder)
} // namespace mace
#endif // MACE_CORE_OPERATOR_H_
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,39 +12,48 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_OPS_ARM_FP32_CONV_GENERAL_H_
-#define MACE_OPS_ARM_FP32_CONV_GENERAL_H_
-#include <vector>
-#include "mace/public/mace.h"
-#include "mace/core/tensor.h"
-#include "mace/core/op_context.h"
-#include "mace/ops/arm/fp32/conv_2d.h"
+#include "mace/core/ops/op_condition_builder.h"
 namespace mace {
-namespace ops {
-namespace arm {
-namespace fp32 {
-class Conv2dGeneral : public Conv2dBase {
- public:
-  Conv2dGeneral(const std::vector<int> &strides,
-                const std::vector<int> &dilations,
-                const std::vector<int> &paddings,
-                const Padding padding_type)
-      : Conv2dBase(strides, dilations, paddings, padding_type) {}
-  virtual ~Conv2dGeneral() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
-};
-}  // namespace fp32
-}  // namespace arm
-}  // namespace ops
-}  // namespace mace
-#endif  // MACE_OPS_ARM_FP32_CONV_GENERAL_H_
+OpConditionBuilder::OpConditionBuilder(const std::string &type)
+    : type_(type) {}
+const std::string OpConditionBuilder::type() const {
+  return type_;
+}
+OpConditionBuilder &OpConditionBuilder::SetDevicePlacerFunc(
+    OpRegistrationInfo::DevicePlacer placer) {
+  placer_ = placer;
+  return *this;
+}
+OpConditionBuilder &OpConditionBuilder::SetInputMemoryTypeSetter(
+    OpRegistrationInfo::MemoryTypeSetter setter) {
+  memory_type_setter_ = setter;
+  return *this;
+}
+OpConditionBuilder &OpConditionBuilder::SetInputsDataFormatSelector(
+    OpRegistrationInfo::DataFormatSelector selector) {
+  data_format_selector_ = selector;
+  return *this;
+}
+void OpConditionBuilder::Finalize(OpRegistrationInfo *info) const {
+  if (info != nullptr) {
+    if (placer_) {
+      info->device_placer = placer_;
+    }
+    if (memory_type_setter_) {
+      info->memory_type_setter = memory_type_setter_;
+    }
+    if (data_format_selector_) {
+      info->data_format_selector = data_format_selector_;
+    }
+  }
+}
+}  // namespace mace
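The builder gives each op type a fluent way to override the defaults that `OpRegistrationInfo` installs; `Finalize` copies only the conditions that were actually set. A hedged usage sketch, extending the documentation's `MyCustomOp` example and assuming the `MACE_REGISTER_OP` / `MACE_REGISTER_OP_CONDITION` macros from the old `mace/core/operator.h` survive the move into `mace/core/registry/ops_registry.h`; the lambda body is illustrative:

```c++
#include <set>
#include "mace/core/registry/ops_registry.h"
#include "mace/utils/macros.h"

namespace mace {
namespace ops {

void RegisterMyCustomOp(OpRegistry *op_registry) {
  MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
                   DeviceType::CPU, float);
  // Override the default device placer: e.g. keep this op on the CPU.
  MACE_REGISTER_OP_CONDITION(
      op_registry,
      OpConditionBuilder("MyCustomOp")
          .SetDevicePlacerFunc(
              [](OpConditionContext *context) -> std::set<DeviceType> {
                MACE_UNUSED(context);
                return {DeviceType::CPU};
              }));
}

}  // namespace ops
}  // namespace mace
```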
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_OPS_OP_CONDITION_BUILDER_H_
#define MACE_CORE_OPS_OP_CONDITION_BUILDER_H_
#include <memory>
#include <string>
#include "mace/core/registry/op_registration_info.h"
#include "mace/core/types.h"
namespace mace {
class OpConditionBuilder {
public:
explicit OpConditionBuilder(const std::string &type);
const std::string type() const;
OpConditionBuilder &SetDevicePlacerFunc(
OpRegistrationInfo::DevicePlacer placer);
// If you set input memory type for specified Op,
// you must call OpConditionContext::set_output_mem_type
OpConditionBuilder &SetInputMemoryTypeSetter(
OpRegistrationInfo::MemoryTypeSetter setter);
OpConditionBuilder &SetInputsDataFormatSelector(
OpRegistrationInfo::DataFormatSelector selector);
void Finalize(OpRegistrationInfo *info) const;
private:
std::string type_;
OpRegistrationInfo::DevicePlacer placer_;
OpRegistrationInfo::MemoryTypeSetter memory_type_setter_;
OpRegistrationInfo::DataFormatSelector data_format_selector_;
};
} // namespace mace
#endif // MACE_CORE_OPS_OP_CONDITION_BUILDER_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/ops/op_condition_context.h"
#include "mace/core/arg_helper.h"
#include "mace/proto/mace.pb.h"
#include "mace/utils/logging.h"
namespace mace {
OpConditionContext::OpConditionContext(
const Workspace *ws,
OpConditionContext::TensorShapeMap *info)
: operator_def_(nullptr),
ws_(ws),
device_(nullptr),
tensor_shape_info_(info) {}
void OpConditionContext::set_operator_def(
const OperatorDef *operator_def) {
operator_def_ = operator_def;
input_data_types_.clear();
}
void OpConditionContext::SetInputInfo(size_t idx,
MemoryType mem_type,
DataType dt) {
if (input_mem_types_.empty()) {
// the default inputs' memory types are same as output memory type.
input_mem_types_.resize(operator_def_->input_size(), output_mem_type_);
}
if (input_data_types_.empty()) {
// the default inputs' data types are same as operation's data type.
DataType op_dt = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def_, "T", static_cast<int>(DataType::DT_FLOAT)));
input_data_types_.resize(operator_def_->input_size(), op_dt);
}
MACE_CHECK(idx < input_mem_types_.size() && idx < input_data_types_.size());
input_mem_types_[idx] = mem_type;
input_data_types_[idx] = dt;
}
void OpConditionContext::set_output_mem_type(MemoryType type) {
MACE_CHECK(operator_def_ != nullptr);
output_mem_type_ = type;
input_mem_types_.clear();
}
MemoryType OpConditionContext::GetInputMemType(size_t idx) const {
if (input_mem_types_.empty()) {
return output_mem_type_;
}
MACE_CHECK(idx < input_mem_types_.size(),
idx, " < ", input_mem_types_.size());
return input_mem_types_[idx];
}
DataType OpConditionContext::GetInputDataType(size_t idx) const {
if (input_data_types_.empty()) {
// the default inputs' data types are same as operation's data type.
return static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def_, "T", static_cast<int>(DataType::DT_FLOAT)));
}
MACE_CHECK(idx < input_data_types_.size());
return input_data_types_[idx];
}
#ifdef MACE_ENABLE_OPENCL
void OpConditionContext::SetInputOpenCLBufferType(
size_t idx, OpenCLBufferType buffer_type) {
if (input_opencl_buffer_types_.empty()) {
// the default inputs' memory types are same as output memory type.
input_opencl_buffer_types_.resize(operator_def_->input_size(),
OpenCLBufferType::IN_OUT_CHANNEL);
}
MACE_CHECK(idx < input_opencl_buffer_types_.size());
input_opencl_buffer_types_[idx] = buffer_type;
}
OpenCLBufferType OpConditionContext::GetInputOpenCLBufferType(
size_t idx) const {
if (input_opencl_buffer_types_.empty()) {
return OpenCLBufferType::IN_OUT_CHANNEL;
}
MACE_CHECK(idx < input_opencl_buffer_types_.size());
return input_opencl_buffer_types_[idx];
}
#endif // MACE_ENABLE_OPENCL
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_OPS_OP_CONDITION_CONTEXT_H_
#define MACE_CORE_OPS_OP_CONDITION_CONTEXT_H_
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "mace/core/types.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
class Workspace;
class Device;
// OpConditionContext has all information used for choosing proper Op
class OpConditionContext {
public:
typedef std::unordered_map<std::string, std::vector<index_t>> TensorShapeMap;
OpConditionContext(const Workspace *ws, TensorShapeMap *info);
~OpConditionContext() = default;
void set_operator_def(const OperatorDef *operator_def);
const OperatorDef *operator_def() const {
return operator_def_;
}
const Workspace *workspace() const {
return ws_;
}
void set_device(Device *device) {
device_ = device;
}
Device *device() const {
return device_;
}
TensorShapeMap *tensor_shape_info() const {
return tensor_shape_info_;
}
void set_output_mem_type(MemoryType type);
MemoryType output_mem_type() const {
return output_mem_type_;
}
void SetInputInfo(size_t idx, MemoryType mem_type, DataType dt);
MemoryType GetInputMemType(size_t idx) const;
DataType GetInputDataType(size_t idx) const;
#ifdef MACE_ENABLE_OPENCL
void SetInputOpenCLBufferType(size_t idx, OpenCLBufferType buffer_type);
OpenCLBufferType GetInputOpenCLBufferType(size_t idx) const;
#endif // MACE_ENABLE_OPENCL
private:
const OperatorDef *operator_def_;
const Workspace *ws_;
Device *device_;
TensorShapeMap *tensor_shape_info_;
// used for memory transform
std::vector<MemoryType> input_mem_types_;
std::vector<DataType> input_data_types_;
MemoryType output_mem_type_; // there is only one output memory type now.
#ifdef MACE_ENABLE_OPENCL
std::vector<OpenCLBufferType> input_opencl_buffer_types_;
#endif // MACE_ENABLE_OPENCL
};
} // namespace mace
#endif // MACE_CORE_OPS_OP_CONDITION_CONTEXT_H_
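One non-obvious detail of the implementation above: `set_output_mem_type()` clears any per-input overrides, and `SetInputInfo()` backfills unset inputs from the current output memory type, so the output type must be fixed first. A sketch of a hypothetical memory-type setter illustrating that order (the function name and policy are not from this commit):

```c++
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/ops/op_condition_context.h"

namespace mace {
// Hypothetical setter for an op whose first input must stay a GPU buffer.
void MyOpMemoryTypeSetter(OpConditionContext *context) {
  // Fix the output type first; this resets per-input overrides.
  context->set_output_mem_type(MemoryType::GPU_IMAGE);
  // Then pin input 0; the remaining inputs default to GPU_IMAGE.
  context->SetInputInfo(0, MemoryType::GPU_BUFFER, DataType::DT_FLOAT);
}
}  // namespace mace
#endif  // MACE_ENABLE_OPENCL
```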
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/ops/op_construct_context.h"
namespace mace {
OpConstructContext::OpConstructContext(Workspace *ws)
: operator_def_(nullptr),
ws_(ws),
device_(nullptr) {}
void OpConstructContext::set_operator_def(
std::shared_ptr<OperatorDef> operator_def) {
operator_def_ = operator_def;
}
} // namespace mace
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,49 +12,62 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_OPS_ARM_FP32_DECONV_2D_GENERAL_H_
-#define MACE_OPS_ARM_FP32_DECONV_2D_GENERAL_H_
+#ifndef MACE_CORE_OPS_OP_CONSTRUCT_CONTEXT_H_
+#define MACE_CORE_OPS_OP_CONSTRUCT_CONTEXT_H_
-#include <vector>
 #include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
-#include "mace/public/mace.h"
-#include "mace/core/tensor.h"
+#include "mace/core/arg_helper.h"
 #include "mace/core/types.h"
-#include "mace/core/op_context.h"
-#include "mace/ops/arm/fp32/deconv_2d.h"
-#include "mace/ops/common/conv_pool_2d_util.h"
+#include "mace/proto/mace.pb.h"
 namespace mace {
-namespace ops {
-namespace arm {
-namespace fp32 {
-class Deconv2dGeneral : public Deconv2dBase {
- public:
-  Deconv2dGeneral(const std::vector<int> &strides,
-                  const std::vector<int> &dilations,
-                  const std::vector<int> &paddings,
-                  const Padding padding_type,
-                  const FrameworkType framework_type)
-      : Deconv2dBase(strides,
-                     dilations,
-                     paddings,
-                     padding_type,
-                     framework_type) {}
-  virtual ~Deconv2dGeneral() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      const Tensor *output_shape,
-      Tensor *output) override;
-};
-}  // namespace fp32
-}  // namespace arm
-}  // namespace ops
+class Device;
+class Workspace;
+// memory_optimizer, device
+class OpConstructContext {
+  typedef std::unordered_map<std::string, std::vector<index_t>> TensorShapeMap;
+ public:
+  explicit OpConstructContext(Workspace *ws);
+  ~OpConstructContext() = default;
+  void set_operator_def(std::shared_ptr<OperatorDef> operator_def);
+  std::shared_ptr<OperatorDef> operator_def() const {
+    return operator_def_;
+  }
+  Workspace *workspace() const {
+    return ws_;
+  }
+  void set_device(Device *device) {
+    device_ = device;
+  }
+  Device *device() const {
+    return device_;
+  }
+#ifdef MACE_ENABLE_OPENCL
+  inline MemoryType GetOpMemoryType() const {
+    return static_cast<MemoryType>(
+        ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
+            *operator_def_, OutputMemoryTypeTagName(),
+            static_cast<int>(MemoryType::CPU_BUFFER)));
+  }
+#endif  // MACE_ENABLE_OPENCL
+ private:
+  std::shared_ptr<OperatorDef> operator_def_;
+  Workspace *ws_;
+  Device *device_;
+};
 }  // namespace mace
-#endif  // MACE_OPS_ARM_FP32_DECONV_2D_GENERAL_H_
+#endif  // MACE_CORE_OPS_OP_CONSTRUCT_CONTEXT_H_
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "mace/core/op_context.h"
+#include "mace/core/ops/op_context.h"
 namespace mace {
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_CORE_OP_CONTEXT_H_
-#define MACE_CORE_OP_CONTEXT_H_
+#ifndef MACE_CORE_OPS_OP_CONTEXT_H_
+#define MACE_CORE_OPS_OP_CONTEXT_H_
 #include "mace/core/device.h"
 #include "mace/core/workspace.h"
@@ -35,8 +35,7 @@ class OpContext {
   Device *device_;
   Workspace *ws_;
   StatsFuture *future_;
-  // metadata
 };
 }  // namespace mace
-#endif  // MACE_CORE_OP_CONTEXT_H_
+#endif  // MACE_CORE_OPS_OP_CONTEXT_H_
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,40 +12,47 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_OPS_REF_ACTIVATION_H_
-#define MACE_OPS_REF_ACTIVATION_H_
-#include "mace/core/op_context.h"
-#include "mace/ops/common/activation_type.h"
+#ifndef MACE_CORE_OPS_OP_DELEGATOR_H_
+#define MACE_CORE_OPS_OP_DELEGATOR_H_
+#include <memory>
+#include "mace/utils/macros.h"
+#include "mace/utils/memory.h"
 namespace mace {
-namespace ops {
-namespace ref {
-class Activation {
- public:
-  explicit Activation(ActivationType type,
-                      const float limit,
-                      const float leakyrelu_coefficient);
-  ~Activation() = default;
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      Tensor *output);
- private:
-  void DoActivation(const OpContext *context,
-                    const Tensor *input,
-                    Tensor *output);
-  ActivationType type_;
-  const float limit_;
-  const float leakyrelu_coefficient_;
-};
-}  // namespace ref
-}  // namespace ops
+enum ImplType {
+  REF = 0,
+  NEON,
+};
+#ifdef MACE_ENABLE_NEON
+#define MACE_CPU_IMPL_TYPE NEON
+#else
+#define MACE_CPU_IMPL_TYPE REF
+#endif
+struct DelegatorParam {
+ public:
+  DelegatorParam() = default;
+  virtual ~DelegatorParam() = default;
+};
+class OpDelegator {
+ public:
+  explicit OpDelegator(const DelegatorParam &param) {
+    MACE_UNUSED(param);
+  }
+  virtual ~OpDelegator() = default;
+  template<class DerivedType, class ParamType>
+  static std::unique_ptr<OpDelegator> DefaultCreator(
+      const DelegatorParam &param) {
+    return make_unique<DerivedType>(static_cast<const ParamType &>(param));
+  }
+};
 }  // namespace mace
-#endif  // MACE_OPS_REF_ACTIVATION_H_
+#endif  // MACE_CORE_OPS_OP_DELEGATOR_H_
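`DefaultCreator` is the type-erasure point of the new mechanism: the registry stores a creator taking a `const DelegatorParam &`, and the creator casts it back to the concrete param type. A minimal sketch with a hypothetical `GemmDelegator`/`GemmParam` pair (names and field are illustrative):

```c++
#include "mace/core/ops/op_delegator.h"

namespace mace {

// Hypothetical param/delegator pair.
struct GemmParam : public DelegatorParam {
  bool transpose_a = false;  // illustrative field
};

class GemmDelegator : public OpDelegator {
 public:
  explicit GemmDelegator(const GemmParam &param)
      : OpDelegator(param), transpose_a_(param.transpose_a) {}

 private:
  bool transpose_a_;
};

// Suitable for OpDelegatorRegistry::Register(key, creator); the base-class
// reference is cast back to GemmParam inside DefaultCreator.
auto gemm_creator = OpDelegator::DefaultCreator<GemmDelegator, GemmParam>;

}  // namespace mace
```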
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/ops/op_init_context.h"
namespace mace {
OpInitContext::OpInitContext(Workspace *ws, Device *device)
: ws_(ws), device_(device) {}
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_OPS_OP_INIT_CONTEXT_H_
#define MACE_CORE_OPS_OP_INIT_CONTEXT_H_
namespace mace {
class Workspace;
class Device;
// memory_optimizer, device
class OpInitContext {
public:
explicit OpInitContext(Workspace *ws, Device *device = nullptr);
~OpInitContext() = default;
Workspace *workspace() const {
return ws_;
}
void set_device(Device *device) {
device_ = device;
}
Device *device() const {
return device_;
}
private:
Workspace *ws_;
Device *device_;
};
} // namespace mace
#endif // MACE_CORE_OPS_OP_INIT_CONTEXT_H_
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/ops/operator.h"
#include <vector>
#include "mace/core/ops/op_construct_context.h"
#include "mace/core/ops/op_init_context.h"
namespace mace {
Operation::Operation(OpConstructContext *context)
: operator_def_(context->operator_def()) {}
MaceStatus Operation::Init(OpInitContext *context) {
Workspace *ws = context->workspace();
for (const std::string &input_str : operator_def_->input()) {
const Tensor *tensor = ws->GetTensor(input_str);
MACE_CHECK(tensor != nullptr, "op ", operator_def_->type(),
": Encountered a non-existing input tensor: ", input_str);
inputs_.push_back(tensor);
}
for (int i = 0; i < operator_def_->output_size(); ++i) {
const std::string output_str = operator_def_->output(i);
if (ws->HasTensor(output_str)) {
outputs_.push_back(ws->GetTensor(output_str));
} else {
MACE_CHECK(
operator_def_->output_type_size() == 0 ||
operator_def_->output_size() == operator_def_->output_type_size(),
"operator output size != operator output type size",
operator_def_->output_size(),
operator_def_->output_type_size());
DataType output_type;
if (i < operator_def_->output_type_size()) {
output_type = operator_def_->output_type(i);
} else {
output_type = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def_, "T", static_cast<int>(DT_FLOAT)));
}
outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
output_str, context->device()->allocator(), output_type)));
}
if (i < operator_def_->output_shape_size()) {
std::vector<index_t>
shape_configured(operator_def_->output_shape(i).dims_size());
for (size_t dim = 0; dim < shape_configured.size(); ++dim) {
shape_configured[dim] = operator_def_->output_shape(i).dims(dim);
}
ws->GetTensor(output_str)->SetShapeConfigured(shape_configured);
}
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace mace
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_OPS_OPERATOR_H_
#define MACE_CORE_OPS_OPERATOR_H_
#include <memory>
#include <string>
#include <vector>
#include "mace/core/arg_helper.h"
#include "mace/core/ops/op_construct_context.h"
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/proto/mace.pb.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
class OpInitContext;
// Conventions
// * If there exist format, NHWC is the default format
// * The input/output format of CPU ops with float data type is NCHW
// * The input/output format of GPU ops and CPU Quantization ops is NHWC
// * Inputs' data type is same as the operation data type by default.
// * The outputs' data type is same as the operation data type by default.
class Operation {
public:
explicit Operation(OpConstructContext *context);
virtual ~Operation() = default;
template<typename T>
T GetOptionalArg(const std::string &name,
const T &default_value) const {
MACE_CHECK(operator_def_, "operator_def was null!");
return ProtoArgHelper::GetOptionalArg<OperatorDef, T>(
*operator_def_, name, default_value);
}
template<typename T>
std::vector<T> GetRepeatedArgs(
const std::string &name, const std::vector<T> &default_value = {}) const {
MACE_CHECK(operator_def_, "operator_def was null!");
return ProtoArgHelper::GetRepeatedArgs<OperatorDef, T>(
*operator_def_, name, default_value);
}
DeviceType device_type() const {
return static_cast<DeviceType>(operator_def_->device_type());
}
const Tensor *Input(unsigned int idx) {
MACE_CHECK(idx < inputs_.size());
return inputs_[idx];
}
Tensor *Output(int idx) { return outputs_[idx]; }
int InputSize() { return inputs_.size(); }
int OutputSize() { return outputs_.size(); }
const std::vector<const Tensor *> &Inputs() const { return inputs_; }
const std::vector<Tensor *> &Outputs() { return outputs_; }
// Run Op asynchronously (depends on device), return a future if not nullptr.
virtual MaceStatus Init(OpInitContext *);
virtual MaceStatus Run(OpContext *) = 0;
const OperatorDef &debug_def() const {
MACE_CHECK(has_debug_def(), "operator_def was null!");
return *operator_def_;
}
void set_debug_def(
const std::shared_ptr<OperatorDef> &operator_def) {
operator_def_ = operator_def;
}
bool has_debug_def() const { return operator_def_ != nullptr; }
inline std::shared_ptr<OperatorDef> operator_def() {
return operator_def_;
}
protected:
std::shared_ptr<OperatorDef> operator_def_;
std::vector<const Tensor *> inputs_;
std::vector<Tensor *> outputs_;
MACE_DISABLE_COPY_AND_ASSIGN(Operation);
};
// MACE_OP_INPUT_TAGS and MACE_OP_OUTPUT_TAGS are optional features to name the
// indices of the operator's inputs and outputs, in order to avoid confusion.
// For example, for a fully convolution layer that has input, weight and bias,
// you can define its input tags as:
// MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
// And in the code, instead of doing
// auto& weight = Input(1);
// you can now do
// auto& weight = Input(WEIGHT);
// to make it more clear.
#define MACE_OP_INPUT_TAGS(first_input, ...) \
enum _InputTags { first_input = 0, __VA_ARGS__ }
#define MACE_OP_OUTPUT_TAGS(first_input, ...) \
enum _OutputTags { first_input = 0, __VA_ARGS__ }
} // namespace mace
#endif // MACE_CORE_OPS_OPERATOR_H_
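A compilable sketch of the tag macros described in the comment above, using a hypothetical fully-connected op: `WEIGHT` expands to 1, so `Input(WEIGHT)` is equivalent to `Input(1)` but self-documenting.

```c++
#include "mace/core/ops/operator.h"
#include "mace/utils/macros.h"

namespace mace {
namespace ops {

// Hypothetical op, for illustration only.
class FullyConnectedOp : public Operation {
 public:
  explicit FullyConnectedOp(OpConstructContext *context)
      : Operation(context) {}

  MaceStatus Run(OpContext *context) override {
    MACE_UNUSED(context);
    const Tensor *weight = Input(WEIGHT);  // instead of Input(1)
    MACE_UNUSED(weight);
    return MaceStatus::MACE_SUCCESS;
  }

  MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
};

}  // namespace ops
}  // namespace mace
```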
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/registry/op_delegator_registry.h"
#include <utility>
#include "mace/utils/logging.h"
namespace mace {
MaceStatus OpDelegatorRegistry::Register(const std::string &key,
DelegatorCreator creator) {
MACE_CHECK(registry_.count(key) == 0, "Registering an existing key: ", key);
registry_[key] = std::move(creator);
return MaceStatus::MACE_SUCCESS;
}
DelegatorCreator OpDelegatorRegistry::GetCreator(const std::string &key) const {
MACE_CHECK(registry_.count(key) > 0, key, " does not exist.");
return registry_.at(key);
}
template<> const char *DType<float>::name_ = "float";
template<> const char *DType<int>::name_ = "int";
template<> const char *DType<uint8_t>::name_ = "uint8_t";
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_REGISTRY_OP_DELEGATOR_REGISTRY_H_
#define MACE_CORE_REGISTRY_OP_DELEGATOR_REGISTRY_H_
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "mace/core/ops/op_delegator.h"
#include "mace/proto/mace.pb.h"
#include "mace/public/mace.h"
namespace mace {
typedef std::function<std::unique_ptr<OpDelegator>(const DelegatorParam &)>
DelegatorCreator;
class OpDelegatorRegistry {
public:
OpDelegatorRegistry() = default;
~OpDelegatorRegistry() = default;
MaceStatus Register(const std::string &key, DelegatorCreator creator);
DelegatorCreator GetCreator(const std::string &key) const;
private:
std::unordered_map<std::string, DelegatorCreator> registry_;
};
template<typename T>
struct DType { static const char *name_; };
template<> const char *DType<float>::name_;
template<> const char *DType<int>::name_;
template<> const char *DType<uint8_t>::name_;
} // namespace mace
#ifndef MACE_DELEGATOR_KEY_TMP
#define MACE_DELEGATOR_KEY_TMP(delegator_name, device, DT, impl) \
(std::string(#delegator_name"_"#device"_"#impl"_") + DType<DT>::name_)
#endif // MACE_DELEGATOR_KEY_TMP
#ifndef MACE_DELEGATOR_KEY
#define MACE_DELEGATOR_KEY(delegator_name, device, DT, impl) \
MACE_DELEGATOR_KEY_TMP(delegator_name, device, DT, impl)
#endif // MACE_DELEGATOR_KEY
#ifndef MACE_DELEGATOR_KEY_EX_TMP
#define MACE_DELEGATOR_KEY_EX_TMP(delegator_name, device, DT, impl, tag) \
(std::string(#delegator_name"_"#device"_"#impl"_"#tag"_") + DType<DT>::name_)
#endif // MACE_DELEGATOR_KEY_EX_TMP
#ifndef MACE_DELEGATOR_KEY_EX
#define MACE_DELEGATOR_KEY_EX(delegator_name, device, DT, impl, tag) \
MACE_DELEGATOR_KEY_EX_TMP(delegator_name, device, DT, impl, tag)
#endif // MACE_DELEGATOR_KEY_EX
#ifndef MACE_REGISTER_DELEGATOR
#define MACE_REGISTER_DELEGATOR(registry, class_name, param_name, key) \
void Register##class_name##Delegator(OpDelegatorRegistry *registry) { \
registry->Register( \
key, OpDelegator::DefaultCreator<class_name, param_name>); \
}
#endif // MACE_REGISTER_DELEGATOR
#ifndef MACE_DEFINE_DELEGATOR_CREATOR
#define MACE_DEFINE_DELEGATOR_CREATOR(class_name) \
static std::unique_ptr<class_name> Create( \
Workspace *workspace, const std::string &tag, \
const DelegatorParam &param) { \
DelegatorCreator creator = \
workspace->GetDelegatorRegistry()->GetCreator(tag); \
std::unique_ptr<OpDelegator> delegator = creator(param); \
return std::unique_ptr<class_name>( \
static_cast<class_name *>(delegator.release())); \
}
#endif // MACE_DEFINE_DELEGATOR_CREATOR
#endif // MACE_CORE_REGISTRY_OP_DELEGATOR_REGISTRY_H_
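A worked example of the key macros above (a sketch; the expected strings follow from the `DType` specializations in the accompanying .cc). The `_TMP` indirection matters: because `MACE_DELEGATOR_KEY` forwards to the `_TMP` variant, an argument such as `MACE_CPU_IMPL_TYPE` is macro-expanded to `NEON` or `REF` before stringization.

```c++
#include <cassert>
#include <string>
#include "mace/core/registry/op_delegator_registry.h"

// Verifies the key strings the macros produce for two assumed call sites.
void CheckDelegatorKeys() {
  using namespace mace;
  assert(MACE_DELEGATOR_KEY(Gemm, CPU, float, NEON) ==
         "Gemm_CPU_NEON_float");
  assert(MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K3x3S1) ==
         "Conv2d_CPU_NEON_K3x3S1_float");
}
```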
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/registry/op_registration_info.h"
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "mace/core/ops/op_condition_context.h"
namespace mace {
OpRegistrationInfo::OpRegistrationInfo() {
// default device type placer
device_placer = [this](OpConditionContext *context) -> std::set<DeviceType> {
MACE_UNUSED(context);
return this->devices;
};
// default input and output memory type setter
memory_type_setter = [](OpConditionContext *context) -> void {
if (context->device()->device_type() == DeviceType::GPU) {
#ifdef MACE_ENABLE_OPENCL
if (context->device()->gpu_runtime()->UseImageMemory()) {
context->set_output_mem_type(MemoryType::GPU_IMAGE);
} else {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
}
#endif // MACE_ENABLE_OPENCL
} else {
context->set_output_mem_type(MemoryType::CPU_BUFFER);
}
};
data_format_selector = [](OpConditionContext *context)
-> std::vector<DataFormat> {
DataFormat op_data_format =
static_cast<DataFormat>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*context->operator_def(), "data_format",
static_cast<int>(DataFormat::NONE)));
return std::vector<DataFormat>(context->operator_def()->input_size(),
op_data_format);
};
}
void OpRegistrationInfo::AddDevice(DeviceType device) {
devices.insert(device);
}
void OpRegistrationInfo::Register(const std::string &key, OpCreator creator) {
VLOG(3) << "Registering: " << key;
MACE_CHECK(creators.count(key) == 0, "Key already registered: ", key);
creators[key] = std::move(creator);
}
} // namespace mace
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,40 +12,45 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_OPS_ARM_FP32_CONV_2D_1X1_H_
-#define MACE_OPS_ARM_FP32_CONV_2D_1X1_H_
+#ifndef MACE_CORE_REGISTRY_OP_REGISTRATION_INFO_H_
+#define MACE_CORE_REGISTRY_OP_REGISTRATION_INFO_H_
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_map>
 #include <vector>
-#include "mace/public/mace.h"
-#include "mace/core/tensor.h"
-#include "mace/core/op_context.h"
-#include "mace/ops/arm/fp32/gemm.h"
-#include "mace/ops/arm/fp32/conv_2d.h"
+#include "mace/core/ops/operator.h"
+#include "mace/proto/mace.pb.h"
 namespace mace {
-namespace ops {
-namespace arm {
-namespace fp32 {
-class Conv2dK1x1 : public Conv2dBase {
- public:
-  Conv2dK1x1(const std::vector<int> &paddings, const Padding padding_type)
-      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
-  virtual ~Conv2dK1x1() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
- private:
-  Gemm gemm_;
-};
-}  // namespace fp32
-}  // namespace arm
-}  // namespace ops
+class OpConstructContext;
+class OpConditionContext;
+class OpRegistrationInfo {
+ public:
+  typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)>
+      OpCreator;
+  typedef std::function<std::set<DeviceType>(OpConditionContext *)>
+      DevicePlacer;
+  typedef std::function<void(OpConditionContext *)> MemoryTypeSetter;
+  typedef std::function<std::vector<DataFormat>(OpConditionContext *)>
+      DataFormatSelector;
+  OpRegistrationInfo();
+  void AddDevice(DeviceType);
+  void Register(const std::string &key, OpCreator creator);
+  std::set<DeviceType> devices;
+  std::unordered_map<std::string, OpCreator> creators;
+  DevicePlacer device_placer;
+  MemoryTypeSetter memory_type_setter;
+  DataFormatSelector data_format_selector;
+};
 }  // namespace mace
-#endif  // MACE_OPS_ARM_FP32_CONV_2D_1X1_H_
+#endif  // MACE_CORE_REGISTRY_OP_REGISTRATION_INFO_H_
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,153 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "mace/core/registry/ops_registry.h"
-#include <sstream>
 #include <map>
 #include <memory>
-#include <set>
-#include <string>
 #include <vector>
-#include "mace/core/operator.h"
 namespace mace {
OpConditionContext::OpConditionContext(
const Workspace *ws,
OpConditionContext::TensorShapeMap *info)
: operator_def_(nullptr),
ws_(ws),
device_(nullptr),
tensor_shape_info_(info) {}
void OpConditionContext::set_operator_def(
const OperatorDef *operator_def) {
operator_def_ = operator_def;
input_data_types_.clear();
}
void OpConditionContext::SetInputInfo(size_t idx,
MemoryType mem_type,
DataType dt) {
if (input_mem_types_.empty()) {
// the default inputs' memory types are same as output memory type.
input_mem_types_.resize(operator_def_->input_size(), output_mem_type_);
}
if (input_data_types_.empty()) {
// the default inputs' data types are same as operation's data type.
DataType op_dt = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def_, "T", static_cast<int>(DataType::DT_FLOAT)));
input_data_types_.resize(operator_def_->input_size(), op_dt);
}
MACE_CHECK(idx < input_mem_types_.size() && idx < input_data_types_.size());
input_mem_types_[idx] = mem_type;
input_data_types_[idx] = dt;
}
void OpConditionContext::set_output_mem_type(MemoryType type) {
MACE_CHECK(operator_def_ != nullptr);
output_mem_type_ = type;
input_mem_types_.clear();
}
MemoryType OpConditionContext::GetInputMemType(size_t idx) const {
if (input_mem_types_.empty()) {
return output_mem_type_;
}
MACE_CHECK(idx < input_mem_types_.size(),
idx, " < ", input_mem_types_.size());
return input_mem_types_[idx];
}
DataType OpConditionContext::GetInputDataType(size_t idx) const {
if (input_data_types_.empty()) {
// the default inputs' data types are same as operation's data type.
return static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def_, "T", static_cast<int>(DataType::DT_FLOAT)));
}
MACE_CHECK(idx < input_data_types_.size());
return input_data_types_[idx];
}
#ifdef MACE_ENABLE_OPENCL
void OpConditionContext::SetInputOpenCLBufferType(
size_t idx, OpenCLBufferType buffer_type) {
if (input_opencl_buffer_types_.empty()) {
// the default inputs' memory types are same as output memory type.
input_opencl_buffer_types_.resize(operator_def_->input_size(),
OpenCLBufferType::IN_OUT_CHANNEL);
}
MACE_CHECK(idx < input_opencl_buffer_types_.size());
input_opencl_buffer_types_[idx] = buffer_type;
}
OpenCLBufferType OpConditionContext::GetInputOpenCLBufferType(
size_t idx) const {
if (input_opencl_buffer_types_.empty()) {
return OpenCLBufferType::IN_OUT_CHANNEL;
}
MACE_CHECK(idx < input_opencl_buffer_types_.size());
return input_opencl_buffer_types_[idx];
}
#endif // MACE_ENABLE_OPENCL
OpConstructContext::OpConstructContext(Workspace *ws)
: operator_def_(nullptr),
ws_(ws),
device_(nullptr) {}
void OpConstructContext::set_operator_def(
std::shared_ptr<OperatorDef> operator_def) {
operator_def_ = operator_def;
}
OpInitContext::OpInitContext(Workspace *ws, Device *device)
: ws_(ws), device_(device) {}
Operation::Operation(OpConstructContext *context)
: operator_def_(context->operator_def()) {}
MaceStatus Operation::Init(OpInitContext *context) {
Workspace *ws = context->workspace();
for (const std::string &input_str : operator_def_->input()) {
const Tensor *tensor = ws->GetTensor(input_str);
MACE_CHECK(tensor != nullptr, "op ", operator_def_->type(),
": Encountered a non-existing input tensor: ", input_str);
inputs_.push_back(tensor);
}
for (int i = 0; i < operator_def_->output_size(); ++i) {
const std::string output_str = operator_def_->output(i);
if (ws->HasTensor(output_str)) {
outputs_.push_back(ws->GetTensor(output_str));
} else {
MACE_CHECK(
operator_def_->output_type_size() == 0 ||
operator_def_->output_size() == operator_def_->output_type_size(),
"operator output size != operator output type size",
operator_def_->output_size(),
operator_def_->output_type_size());
DataType output_type;
if (i < operator_def_->output_type_size()) {
output_type = operator_def_->output_type(i);
} else {
output_type = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def_, "T", static_cast<int>(DT_FLOAT)));
}
outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
output_str, context->device()->allocator(), output_type)));
}
if (i < operator_def_->output_shape_size()) {
std::vector<index_t>
shape_configured(operator_def_->output_shape(i).dims_size());
for (size_t dim = 0; dim < shape_configured.size(); ++dim) {
shape_configured[dim] = operator_def_->output_shape(i).dims(dim);
}
ws->GetTensor(output_str)->SetShapeConfigured(shape_configured);
}
}
return MaceStatus::MACE_SUCCESS;
}
// op registry
namespace {

class OpKeyBuilder {
 public:
...@@ -203,51 +65,7 @@ const std::string OpKeyBuilder::Build() {
}

} // namespace

OpRegistrationInfo::OpRegistrationInfo() {
// default device type placer
device_placer = [this](OpConditionContext *context) -> std::set<DeviceType> {
MACE_UNUSED(context);
return this->devices;
};
// default input and output memory type setter
memory_type_setter = [](OpConditionContext *context) -> void {
if (context->device()->device_type() == DeviceType::GPU) {
#ifdef MACE_ENABLE_OPENCL
if (context->device()->gpu_runtime()->UseImageMemory()) {
context->set_output_mem_type(MemoryType::GPU_IMAGE);
} else {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
}
#endif // MACE_ENABLE_OPENCL
} else {
context->set_output_mem_type(MemoryType::CPU_BUFFER);
}
};
data_format_selector = [](OpConditionContext *context)
-> std::vector<DataFormat> {
DataFormat op_data_format =
static_cast<DataFormat>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*context->operator_def(), "data_format",
static_cast<int>(DataFormat::NONE)));
return std::vector<DataFormat>(context->operator_def()->input_size(),
op_data_format);
};
}
void OpRegistrationInfo::AddDevice(DeviceType device) {
devices.insert(device);
}
void OpRegistrationInfo::Register(const std::string &key, OpCreator creator) {
VLOG(3) << "Registering: " << key;
MACE_CHECK(creators.count(key) == 0, "Key already registered: ", key);
creators[key] = creator;
}
MaceStatus OpRegistryBase::Register(
MaceStatus OpRegistry::Register(
    const std::string &op_type,
    const DeviceType device_type,
    const DataType dt,
...@@ -266,7 +84,7 @@ MaceStatus OpRegistryBase::Register(
  return MaceStatus::MACE_SUCCESS;
}
MaceStatus OpRegistryBase::Register(
MaceStatus OpRegistry::Register(
    const OpConditionBuilder &builder) {
  std::string op_type = builder.type();
  if (registry_.count(op_type) == 0) {
...@@ -277,7 +95,7 @@ MaceStatus OpRegistryBase::Register(
  return MaceStatus::MACE_SUCCESS;
}

const std::set<DeviceType> OpRegistryBase::AvailableDevices(
const std::set<DeviceType> OpRegistry::AvailableDevices(
    const std::string &op_type, OpConditionContext *context) const {
  MACE_CHECK(registry_.count(op_type) != 0,
             op_type, " operation is not registered.");
...@@ -285,7 +103,7 @@ const std::set<DeviceType> OpRegistryBase::AvailableDevices(
  return registry_.at(op_type)->device_placer(context);
}

void OpRegistryBase::GetInOutMemoryTypes(
void OpRegistry::GetInOutMemoryTypes(
    const std::string &op_type,
    OpConditionContext *context) const {
  MACE_CHECK(registry_.count(op_type) != 0,
...@@ -293,7 +111,7 @@ void OpRegistryBase::GetInOutMemoryTypes(
  return registry_.at(op_type)->memory_type_setter(context);
}

const std::vector<DataFormat> OpRegistryBase::InputsDataFormat(
const std::vector<DataFormat> OpRegistry::InputsDataFormat(
    const std::string &op_type,
    OpConditionContext *context) const {
  MACE_CHECK(registry_.count(op_type) != 0,
...@@ -301,7 +119,7 @@ const std::vector<DataFormat> OpRegistryBase::InputsDataFormat(
  return registry_.at(op_type)->data_format_selector(context);
}

std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
std::unique_ptr<Operation> OpRegistry::CreateOperation(
    OpConstructContext *context,
    DeviceType device_type) const {
  auto operator_def = context->operator_def();
...@@ -328,44 +146,4 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
  return registry_.at(op_type)->creators.at(key)(context);
}
OpConditionBuilder::OpConditionBuilder(const std::string &type)
: type_(type) {}
const std::string OpConditionBuilder::type() const {
return type_;
}
OpConditionBuilder &OpConditionBuilder::SetDevicePlacerFunc(
OpRegistrationInfo::DevicePlacer placer) {
placer_ = placer;
return *this;
}
OpConditionBuilder &OpConditionBuilder::SetInputMemoryTypeSetter(
OpRegistrationInfo::MemoryTypeSetter setter) {
memory_type_setter_ = setter;
return *this;
}
OpConditionBuilder &OpConditionBuilder::SetInputsDataFormatSelector(
OpRegistrationInfo::DataFormatSelector selector) {
data_format_selector_ = selector;
return *this;
}
void OpConditionBuilder::Finalize(OpRegistrationInfo *info) const {
if (info != nullptr) {
if (placer_) {
info->device_placer = placer_;
}
if (memory_type_setter_) {
info->memory_type_setter = memory_type_setter_;
}
if (data_format_selector_) {
info->data_format_selector = data_format_selector_;
}
}
}
} // namespace mace
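The builder removed from this file keeps the same public API (it moves with the registry headers). A sketch of how a condition registration drives it, using only the methods shown above; the lambda body is illustrative:

```c++
MACE_REGISTER_OP_CONDITION(
    op_registry,
    OpConditionBuilder("AddN").SetDevicePlacerFunc(
        [](OpConditionContext *context) -> std::set<DeviceType> {
          MACE_UNUSED(context);
          return {DeviceType::CPU, DeviceType::GPU};
        }));
```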
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_REGISTRY_OPS_REGISTRY_H_
#define MACE_CORE_REGISTRY_OPS_REGISTRY_H_
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
#include "mace/core/ops/operator.h"
#include "mace/core/ops/op_condition_builder.h"
#include "mace/core/ops/op_condition_context.h"
#include "mace/public/mace.h"
#include "mace/proto/mace.pb.h"
#include "mace/utils/memory.h"
namespace mace {
class OpRegistry {
public:
OpRegistry() = default;
virtual ~OpRegistry() = default;
MaceStatus Register(const std::string &op_type,
const DeviceType device_type,
const DataType dt,
OpRegistrationInfo::OpCreator creator);
MaceStatus Register(const OpConditionBuilder &builder);
const std::set<DeviceType> AvailableDevices(
const std::string &op_type, OpConditionContext *context) const;
void GetInOutMemoryTypes(
const std::string &op_type, OpConditionContext *context) const;
const std::vector<DataFormat> InputsDataFormat(
const std::string &op_type, OpConditionContext *context) const;
std::unique_ptr<Operation> CreateOperation(
OpConstructContext *context,
DeviceType device_type) const;
template<class DerivedType>
static std::unique_ptr<Operation> DefaultCreator(
OpConstructContext *context) {
return make_unique<DerivedType>(context);
}
private:
std::unordered_map<std::string, std::unique_ptr<OpRegistrationInfo>>
registry_;
MACE_DISABLE_COPY_AND_ASSIGN(OpRegistry);
};
#define MACE_REGISTER_OP(op_registry, op_type, class_name, device, dt) \
op_registry->Register(op_type, \
device, \
DataTypeToEnum<dt>::value, \
OpRegistry::DefaultCreator<class_name<device, dt>>)
#define MACE_REGISTER_OP_BY_CLASS(\
op_registry, op_type, class_name, device, dt) \
op_registry->Register(op_type, \
device, \
DataTypeToEnum<dt>::value, \
OpRegistry::DefaultCreator<class_name>)
#ifdef MACE_ENABLE_OPENCL
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \
op_registry->Register( \
op_type, \
DeviceType::GPU, \
DT_FLOAT, \
OpRegistry::DefaultCreator<class_name<DeviceType::GPU, float>>)
#else
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name)
#endif
#define MACE_REGISTER_OP_CONDITION(op_registry, builder) \
op_registry->Register(builder)
} // namespace mace
#endif // MACE_CORE_REGISTRY_OPS_REGISTRY_H_
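A sketch of how a net builder is expected to drive the new class (workspace and `OperatorDef` setup elided; the calls follow the declarations above):

```c++
// ws: Workspace*; op_def: std::shared_ptr<OperatorDef>; op_registry: OpRegistry*
OpConstructContext construct_context(ws);
construct_context.set_operator_def(op_def);
std::unique_ptr<Operation> op =
    op_registry->CreateOperation(&construct_context, DeviceType::CPU);
```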
...@@ -46,7 +46,7 @@ bool HasHalfTensor(const NetDef &net_def) {
  return false;
}

template <typename T>
template<typename T>
void DequantizeTensor(Device *device,
                      const unsigned char *model_data,
                      const ConstTensor &const_tensor,
...@@ -66,7 +66,8 @@ void DequantizeTensor(Device *device,
} // namespace

Workspace::Workspace() = default;
Workspace::Workspace(const OpDelegatorRegistry *registry) :
    op_delegator_registry_(registry) {}

Tensor *Workspace::CreateTensor(const std::string &name,
                                Allocator *alloc,
...@@ -144,7 +145,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
  DataType dst_data_type = const_tensor.data_type();
  if (device_type == DeviceType::CPU &&
      const_tensor.data_type() == DataType::DT_HALF) {
    dst_data_type = DataType::DT_FLOAT;
  } else if (!is_quantize_model && const_tensor.quantized()) {
    if (device_type == GPU && net_def.data_type() != DataType::DT_FLOAT) {
...@@ -173,13 +174,13 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
  if (device_type == DeviceType::CPU &&
      const_tensor.data_type() == DataType::DT_HALF) {
    // uncompress the weights of fp16
    auto org_data = reinterpret_cast<const half *>(
        model_data + const_tensor.offset());
    float *dst_data = tensor->mutable_data<float>();
    for (int i = 0; i < const_tensor.data_size(); ++i) {
      dst_data[i] = half_float::half_cast<float>(org_data[i]);
    }
  } else if (!is_quantize_model && const_tensor.quantized()) {
    // uncompress the weights of uint8
    if (dst_data_type != DT_FLOAT) {
...@@ -401,4 +402,8 @@ void Workspace::RemoveTensor(const std::string &name) {
  }
}

const OpDelegatorRegistry *Workspace::GetDelegatorRegistry() const {
  return op_delegator_registry_;
}

} // namespace mace
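Storing the registry on the workspace is what lets ops resolve kernels at construction time instead of hard-linking a NEON or reference class. A minimal sketch, assuming `ops::RegisterAllOpDelegators` fills the registry as elsewhere in this commit:

```c++
OpDelegatorRegistry delegator_registry;
ops::RegisterAllOpDelegators(&delegator_registry);
Workspace ws(&delegator_registry);
// Ops reach it via ws.GetDelegatorRegistry(); the delegator::X::Create
// factories shown later presumably route through this pointer.
```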
...@@ -27,13 +27,14 @@
namespace mace {

class OpDelegatorRegistry;
class MemoryOptimizer;

class Workspace {
 public:
  typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap;

  Workspace();
  explicit Workspace(const OpDelegatorRegistry *registry);
  ~Workspace() {}

  Tensor *CreateTensor(const std::string &name,
...@@ -71,15 +72,16 @@ class Workspace {
  void RemoveTensor(const std::string &name);

  const OpDelegatorRegistry *GetDelegatorRegistry() const;

 private:
  TensorMap tensor_map_;
  std::unique_ptr<BufferBase> tensor_buffer_;
  PreallocatedPooledAllocator preallocated_allocator_;
  bool diffused_buffer_;
  const OpDelegatorRegistry *op_delegator_registry_;

  MACE_DISABLE_COPY_AND_ASSIGN(Workspace);
};
......
...@@ -19,8 +19,10 @@
#include "mace/core/device_context.h"
#include "mace/core/memory_optimizer.h"
#include "mace/core/net.h"
#include "mace/ops/registry/ops_registry.h"
#include "mace/core/registry/ops_registry.h"
#include "mace/core/registry/op_delegator_registry.h"
#include "mace/ops/common/transpose.h"
#include "mace/ops/registry/registry.h"
#include "mace/utils/math.h"
#include "mace/utils/memory.h"
#include "mace/utils/stl_util.h"
...@@ -451,7 +453,8 @@ class MaceEngine::Impl {
 private:
  std::unique_ptr<port::ReadOnlyMemoryRegion> model_data_;
  std::unique_ptr<OpRegistryBase> op_registry_;
  std::unique_ptr<OpRegistry> op_registry_;
  std::unique_ptr<OpDelegatorRegistry> op_delegator_registry_;
  DeviceType device_type_;
  std::unique_ptr<Device> device_;
  std::unique_ptr<Workspace> ws_;
...@@ -478,9 +481,10 @@ class MaceEngine::Impl {
MaceEngine::Impl::Impl(const MaceEngineConfig &config)
    : model_data_(nullptr),
      op_registry_(new OpRegistry),
      op_delegator_registry_(new OpDelegatorRegistry),
      device_type_(config.impl_->device_type()),
      device_(nullptr),
      ws_(new Workspace()),
      ws_(new Workspace(op_delegator_registry_.get())),
      net_(nullptr),
      is_quantized_model_(false),
      thread_pool_(new utils::ThreadPool(config.impl_->num_threads(),
...@@ -498,6 +502,8 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
#endif
{
  LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion();
  ops::RegisterAllOps(op_registry_.get());
  ops::RegisterAllOpDelegators(op_delegator_registry_.get());
  thread_pool_->Init();
  if (device_type_ == DeviceType::CPU) {
    device_.reset(new CPUDevice(config.impl_->num_threads(),
......
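The ownership chain set up here, reduced to a sketch: the engine owns both registries, fills them once, and hands the delegator registry to the workspace. Note the workspace keeps a non-owning pointer, so the registry must outlive it, which the member declaration order above guarantees.

```c++
auto op_registry = make_unique<OpRegistry>();
auto op_delegator_registry = make_unique<OpDelegatorRegistry>();
ops::RegisterAllOps(op_registry.get());
ops::RegisterAllOpDelegators(op_delegator_registry.get());
auto ws = make_unique<Workspace>(op_delegator_registry.get());
```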
...@@ -22,11 +22,13 @@ cc_library(
    srcs = glob(
        [
            "common/*.cc",
            "delegator/*.cc",
        ],
    ),
    hdrs = glob(
        [
            "common/*.h",
            "delegator/*.h",
        ],
    ),
    copts = [
...@@ -58,12 +60,16 @@ cc_library(
        [
            "ref/*.cc",
        ],
    ),
    ) + if_quantize_enabled(glob([
        "ref/q8/*.cc",
    ])),
    hdrs = glob(
        [
            "ref/*.h",
        ],
    ),
    ) + if_quantize_enabled(glob([
        "ref/q8/*.h",
    ])),
    copts = [
        "-Werror",
        "-Wextra",
...@@ -236,12 +242,12 @@ cc_library(
cc_library(
    name = "ops",
    srcs = [
    srcs = glob([
        "registry/ops_registry.cc",
        "registry/*.cc",
    ],
    ]),
    hdrs = [
    hdrs = glob([
        "registry/ops_registry.h",
        "registry/*.h",
    ],
    ]),
    copts = [
        "-Werror",
        "-Wextra",
......
file(GLOB OPS_COMMON_SRCS common/*.cc)
file(GLOB OPS_REF_KERNELS_SRCS ref/*.cc)
file(GLOB OPS_REF_Q8_KERNELS_SRCS
  ref/q8/*.cc
)
file(GLOB OPS_ARM_NEON_FP32_KERNELS_SRCS
  arm/fp32/*.cc
)
...@@ -17,20 +21,23 @@ file(GLOB OPS_OPENCL_KERNELS_SRCS
file(GLOB OPS_INTERNAL_OPS_SRCS *.cc)

set(OPS_SRCS registry/ops_registry.cc)
set(OPS_SRCS registry/ops_registry.cc registry/op_delegators_registry.cc)
set(OPS_SRCS ${OPS_SRCS} ${OPS_COMMON_SRCS})
set(OPS_SRCS ${OPS_SRCS} ${OPS_INTERNAL_OPS_SRCS})
# TODO we need to remove this in production build
set(OPS_SRCS ${OPS_SRCS} ${OPS_REF_KERNELS_SRCS})
if(MACE_ENABLE_QUANTIZE)
  set(OPS_SRCS ${OPS_SRCS} ${OPS_REF_Q8_KERNELS_SRCS})
endif(MACE_ENABLE_QUANTIZE)
if(MACE_ENABLE_NEON)
  set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_FP32_KERNELS_SRCS})
  if(MACE_ENABLE_QUANTIZE)
    set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_Q8_KERNELS_SRCS})
  endif(MACE_ENABLE_QUANTIZE)
endif(MACE_ENABLE_NEON)
if(MACE_ENABLE_QUANTIZE)
  set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_Q8_KERNELS_SRCS})
endif(MACE_ENABLE_QUANTIZE)
if(MACE_ENABLE_OPENCL)
  set(OPS_SRCS ${OPS_SRCS} ${OPS_OPENCL_KERNELS_SRCS})
endif(MACE_ENABLE_OPENCL)
......
...@@ -17,13 +17,10 @@
#include <memory>
#include <set>

#include "mace/core/operator.h"
#include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h"

#if defined(MACE_ENABLE_NEON)
#include "mace/ops/arm/fp32/activation.h"
#else
#include "mace/ops/ref/activation.h"
#endif
#include "mace/ops/delegator/activation.h"

#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
...@@ -37,19 +34,20 @@ namespace ops {
template<DeviceType D, class T>
class ActivationOp;

template<>
class ActivationOp<DeviceType::CPU, float> : public Operation {
template<typename T>
class ActivationOp<DeviceType::CPU, T> : public Operation {
 public:
  explicit ActivationOp(OpConstructContext *context)
      : Operation(context),
        activation_type_(ops::StringToActivationType(
            Operation::GetOptionalArg<std::string>("activation",
                                                   "NOOP"))),
        activation_delegator_(activation_type_,
                              Operation::GetOptionalArg<float>("max_limit",
                                                               0.0f),
                              Operation::GetOptionalArg<float>(
                                  "leakyrelu_coefficient", 0.0f)) {}
        activation_type_(ops::StringToActivationType(
            Operation::GetOptionalArg<std::string>("activation", "NOOP"))),
        activation_delegator_(delegator::Activation::Create(
            context->workspace(),
            MACE_DELEGATOR_KEY(Activation, CPU, T, MACE_CPU_IMPL_TYPE),
            delegator::ActivationParam(
                activation_type_,
                Operation::GetOptionalArg<T>("max_limit", 0),
                Operation::GetOptionalArg<T>("leakyrelu_coefficient", 0)))) {}

  MaceStatus Run(OpContext *context) override {
    MACE_UNUSED(context);
...@@ -58,28 +56,24 @@ class ActivationOp<DeviceType::CPU, float> : public Operation {
    if (activation_type_ == PRELU) {
      MACE_RETURN_IF_ERROR(output->ResizeLike(input));
      const float *input_ptr = input->data<float>();
      const T *input_ptr = input->data<T>();
      float *output_ptr = output->mutable_data<float>();
      T *output_ptr = output->mutable_data<T>();
      MACE_CHECK(this->InputSize() > 1);
      const Tensor *alpha = this->Input(1);
      const float *alpha_ptr = alpha->data<float>();
      const T *alpha_ptr = alpha->data<T>();
      const index_t outer_size = output->dim(0);
      const index_t inner_size = output->dim(2) * output->dim(3);
      PReLUActivation(context, input_ptr, outer_size, input->dim(1), inner_size,
                      alpha_ptr, output_ptr);
    } else {
      activation_delegator_.Compute(context, input, output);
      activation_delegator_->Compute(context, input, output);
    }
    return MaceStatus::MACE_SUCCESS;
  }

 private:
  ActivationType activation_type_;
#if defined(MACE_ENABLE_NEON)
  arm::fp32::Activation activation_delegator_;
#else
  ref::Activation activation_delegator_;
#endif // MACE_ENABLE_NEON
  std::unique_ptr<delegator::Activation> activation_delegator_;
};

#ifdef MACE_ENABLE_OPENCL
...@@ -122,7 +116,7 @@ class ActivationOp<DeviceType::GPU, float> : public Operation {
};
#endif // MACE_ENABLE_OPENCL

void RegisterActivation(OpRegistryBase *op_registry) {
void RegisterActivation(OpRegistry *op_registry) {
  MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
                   DeviceType::CPU, float);
  MACE_REGISTER_GPU_OP(op_registry, "Activation", ActivationOp);
......
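A note on the dispatch that makes the template above work: the delegator key embeds both the data type `T` and `MACE_CPU_IMPL_TYPE`, which presumably expands to `NEON` or `REF` depending on build flags, so the same op body serves both CPU backends. A sketch of the resolved call in a float NEON build (assumed expansion, not actual macro output):

```c++
auto act = delegator::Activation::Create(
    context->workspace(),
    MACE_DELEGATOR_KEY(Activation, CPU, float, NEON),  // resolved key
    param);  // a delegator::ActivationParam built as in the op above
```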
...@@ -20,7 +20,7 @@
#include <string>

#include "mace/core/types.h"
#include "mace/core/op_context.h"
#include "mace/core/ops/op_context.h"
#include "mace/ops/common/activation_type.h"
#include "mace/utils/logging.h"
......
...@@ -19,7 +19,8 @@
#include <algorithm>
#include <memory>

#include "mace/core/operator.h"
#include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h"

#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/addn.h"
...@@ -92,7 +93,7 @@ class AddNOp<DeviceType::GPU, float> : public Operation {
};
#endif // MACE_ENABLE_OPENCL

void RegisterAddN(OpRegistryBase *op_registry) {
void RegisterAddN(OpRegistry *op_registry) {
  MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float);
  MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp);
  MACE_REGISTER_OP_CONDITION(
......
...@@ -18,7 +18,8 @@
#include <memory>
#include <vector>

#include "mace/core/operator.h"
#include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h"

namespace mace {
namespace ops {
...@@ -109,7 +110,7 @@ class ArgMaxOp : public Operation {

void RegisterArgMax(OpRegistryBase *op_registry) {
void RegisterArgMax(OpRegistry *op_registry) {
  MACE_REGISTER_OP(op_registry, "ArgMax", ArgMaxOp,
                   DeviceType::CPU, float);
}
......
...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mace/ops/arm/fp32/activation.h"
#include "mace/ops/delegator/activation.h"

#include <arm_neon.h>
#include <algorithm>
...@@ -22,16 +22,22 @@ namespace ops {
namespace arm {
namespace fp32 {

Activation::Activation(ActivationType type,
                       const float limit,
                       const float leakyrelu_coefficient)
    : type_(type),
      limit_(limit),
      leakyrelu_coefficient_(leakyrelu_coefficient) {}
class Activation : public delegator::Activation {
 public:
  explicit Activation(const delegator::ActivationParam &param)
      : delegator::Activation(param) {}
  ~Activation() = default;

  MaceStatus Compute(const OpContext *context,
                     const Tensor *input, Tensor *output) override;

 private:
  void DoActivation(const OpContext *context,
                    const Tensor *input, Tensor *output);
};

MaceStatus Activation::Compute(const OpContext *context,
                               const Tensor *input,
                               Tensor *output) {
MaceStatus Activation::Compute(const OpContext *context,
                               const Tensor *input, Tensor *output) {
  Tensor::MappingGuard input_guard(input);
  if (input != output) {
    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
...@@ -139,7 +145,7 @@ void Activation::DoActivation(const OpContext *context,
      // remain
      for (index_t i = block_count * 4; i < size; ++i) {
        output_data[i] = std::max(input_data[i], 0.f) +
            std::min(input_data[i], 0.f) * leakyrelu_coefficient_;
      }
      break;
...@@ -169,14 +175,19 @@ void Activation::DoActivation(const OpContext *context,
      break;
    }

    case NOOP:
    case NOOP: {
      break;
    }

    default:
    default: {
      MACE_NOT_IMPLEMENTED;
    }
  }
}

MACE_REGISTER_DELEGATOR(registry, Activation, delegator::ActivationParam,
                        MACE_DELEGATOR_KEY(Activation, CPU, float, NEON))

} // namespace fp32
} // namespace arm
} // namespace ops
......
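The registration macro is the NEON side of the key scheme; a reference implementation presumably registers under the same key with `REF` in place of `NEON`. Pattern sketch for a hypothetical additional NEON delegator (`Tanh` and `delegator::TanhParam` are made up for illustration):

```c++
MACE_REGISTER_DELEGATOR(registry, Tanh, delegator::TanhParam,
                        MACE_DELEGATOR_KEY(Tanh, CPU, float, NEON))
```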
...@@ -12,15 +12,27 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mace/ops/arm/fp32/bias_add.h"

#include <arm_neon.h>

#include "mace/ops/delegator/bias_add.h"

namespace mace {
namespace ops {
namespace arm {
namespace fp32 {

class BiasAdd : public delegator::BiasAdd {
 public:
  explicit BiasAdd(const DelegatorParam &param) : delegator::BiasAdd(param) {}
  ~BiasAdd() = default;

  MaceStatus Compute(const OpContext *context, const Tensor *input,
                     const Tensor *bias, Tensor *output) override;

 private:
  void AddBias(const OpContext *context, const Tensor *input,
               const Tensor *bias, Tensor *output);
};

MaceStatus BiasAdd::Compute(const OpContext *context,
                            const Tensor *input,
                            const Tensor *bias,
...@@ -117,6 +129,9 @@ void BiasAdd::AddBias(const OpContext *context,
  }
}

MACE_REGISTER_DELEGATOR(registry, BiasAdd, DelegatorParam,
                        MACE_DELEGATOR_KEY(BiasAdd, CPU, float, NEON))

} // namespace fp32
} // namespace arm
} // namespace ops
......
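BiasAdd needs no extra configuration, so it registers with the plain `DelegatorParam`; compare the conv kernels below, which carry a `Conv2dParam`. A sketch of creating it from an op, assuming the `Create` factory has the same shape as the Activation one shown earlier:

```c++
auto bias_add = delegator::BiasAdd::Create(
    context->workspace(),
    MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE),
    DelegatorParam());  // assumes DelegatorParam is default-constructible
```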
...@@ -18,36 +18,25 @@
#include <vector>
#include <memory>

#include "mace/public/mace.h"
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/delegator/conv_2d.h"
#include "mace/ops/arm/fp32/gemm.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace arm {
namespace fp32 {

class Conv2dBase {
class Conv2dBase : public delegator::Conv2d {
 public:
  Conv2dBase(const std::vector<int> &strides,
             const std::vector<int> &dilations,
             const std::vector<int> &paddings,
             const Padding padding_type)
      : strides_(strides),
        dilations_(dilations),
        paddings_(paddings),
        padding_type_(padding_type) {}
  explicit Conv2dBase(const delegator::Conv2dParam &param)
      : delegator::Conv2d(param) {}

  virtual ~Conv2dBase() = default;

  virtual MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      Tensor *output) = 0;

 protected:
  void CalOutputShapeAndInputPadSize(const std::vector<index_t> &input_shape,
                                     const std::vector<index_t> &filter_shape,
...@@ -83,11 +72,6 @@ class Conv2dBase {
                const int pad_left,
                Tensor *dst);
  void UnPadOutput(const Tensor &src, Tensor *dst);

  const std::vector<int> strides_;
  const std::vector<int> dilations_;
  const std::vector<int> paddings_;
  const Padding padding_type_;
};

} // namespace fp32
......
...@@ -12,13 +12,32 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mace/ops/arm/fp32/conv_2d_1x1.h"
#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/ops/arm/fp32/gemm.h"
#include "mace/ops/delegator/conv_2d.h"

namespace mace {
namespace ops {
namespace arm {
namespace fp32 {

class Conv2dK1x1 : public Conv2dBase {
 public:
  explicit Conv2dK1x1(const delegator::Conv2dParam &param)
      : Conv2dBase(param),
        gemm_(delegator::GemmParam()) {}
  virtual ~Conv2dK1x1() {}

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      Tensor *output) override;

 private:
  Gemm gemm_;
};

MaceStatus Conv2dK1x1::Compute(const OpContext *context,
                               const Tensor *input,
                               const Tensor *filter,
...@@ -94,6 +113,9 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,
                      output);
}

MACE_REGISTER_DELEGATOR(registry, Conv2dK1x1, delegator::Conv2dParam,
                        MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K1x1))

} // namespace fp32
} // namespace arm
} // namespace ops
......
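With every kernel behind a key, shape-based selection becomes a string choice. A sketch of how a Conv2d op might pick the 1x1 kernel over the general one (the dispatch condition and the `Create` call shape are illustrative; this commit does not show the op-side logic):

```c++
const bool is_1x1 = filter->dim(2) == 1 && filter->dim(3) == 1 &&
                    strides[0] == 1 && strides[1] == 1;
auto conv2d = delegator::Conv2d::Create(
    context->workspace(),
    is_1x1 ? MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K1x1)
           : MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, General),
    conv2d_param);
```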
...@@ -17,6 +17,8 @@
#include <arm_neon.h>
#include <memory>

#include "mace/ops/delegator/conv_2d.h"

namespace mace {
namespace ops {
namespace arm {
...@@ -859,6 +861,19 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context,
  return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Conv2dK1x7S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K1x7S1))
MACE_REGISTER_DELEGATOR(registry, Conv2dK7x1S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x1S1))
MACE_REGISTER_DELEGATOR(registry, Conv2dK1x15S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
NEON, K1x15S1))
MACE_REGISTER_DELEGATOR(registry, Conv2dK15x1S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
NEON, K15x1S1))
} // namespace fp32
} // namespace arm
} // namespace ops
......
...@@ -16,10 +16,11 @@
#define MACE_OPS_ARM_FP32_CONV_2D_1XN_H_

#include <vector>

#include "mace/public/mace.h"
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
...@@ -28,8 +29,8 @@ namespace fp32 {

class Conv2dK1x7S1 : public Conv2dBase {
 public:
  Conv2dK1x7S1(const std::vector<int> &paddings, const Padding padding_type)
      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
  explicit Conv2dK1x7S1(const delegator::Conv2dParam &param)
      : Conv2dBase(param) {}
  virtual ~Conv2dK1x7S1() {}

  MaceStatus Compute(
...@@ -41,8 +42,8 @@ class Conv2dK1x7S1 : public Conv2dBase {

class Conv2dK7x1S1 : public Conv2dBase {
 public:
  Conv2dK7x1S1(const std::vector<int> &paddings, const Padding padding_type)
      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
  explicit Conv2dK7x1S1(const delegator::Conv2dParam &param)
      : Conv2dBase(param) {}
  virtual ~Conv2dK7x1S1() {}

  MaceStatus Compute(
...@@ -54,8 +55,8 @@ class Conv2dK7x1S1 : public Conv2dBase {

class Conv2dK1x15S1 : public Conv2dBase {
 public:
  Conv2dK1x15S1(const std::vector<int> &paddings, const Padding padding_type)
      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
  explicit Conv2dK1x15S1(const delegator::Conv2dParam &param)
      : Conv2dBase(param) {}
  virtual ~Conv2dK1x15S1() {}

  MaceStatus Compute(
...@@ -67,8 +68,8 @@ class Conv2dK1x15S1 : public Conv2dBase {

class Conv2dK15x1S1 : public Conv2dBase {
 public:
  Conv2dK15x1S1(const std::vector<int> &paddings, const Padding padding_type)
      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
  explicit Conv2dK15x1S1(const delegator::Conv2dParam &param)
      : Conv2dBase(param) {}
  virtual ~Conv2dK15x1S1() {}

  MaceStatus Compute(
......
...@@ -17,6 +17,8 @@
#include <arm_neon.h>
#include <memory>

#include "mace/ops/delegator/conv_2d.h"

namespace mace {
namespace ops {
namespace arm {
...@@ -735,6 +737,11 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context,
  return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K3x3S1))
MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3S2, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K3x3S2))
} // namespace fp32
} // namespace arm
} // namespace ops
......
...@@ -16,10 +16,11 @@
#define MACE_OPS_ARM_FP32_CONV_2D_3X3_H_

#include <vector>

#include "mace/public/mace.h"
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
...@@ -28,8 +29,8 @@ namespace fp32 {

class Conv2dK3x3S1 : public Conv2dBase {
 public:
  Conv2dK3x3S1(const std::vector<int> &paddings, const Padding padding_type)
      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
  explicit Conv2dK3x3S1(const delegator::Conv2dParam &param)
      : Conv2dBase(param) {}
  virtual ~Conv2dK3x3S1() {}

  MaceStatus Compute(
...@@ -41,8 +42,8 @@ class Conv2dK3x3S1 : public Conv2dBase {

class Conv2dK3x3S2 : public Conv2dBase {
 public:
  Conv2dK3x3S2(const std::vector<int> &paddings, const Padding padding_type)
      : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {}
  explicit Conv2dK3x3S2(const delegator::Conv2dParam &param)
      : Conv2dBase(param) {}
  virtual ~Conv2dK3x3S2() {}

  MaceStatus Compute(
......
...@@ -17,6 +17,7 @@
#include <algorithm>

#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/conv_2d.h"
#include "mace/utils/memory.h"
#include "mace/utils/math.h"
...@@ -800,6 +801,10 @@ void Conv2dK3x3Winograd::TransformOutput8x8(const OpContext *context,
  }, 0, batch, 1, 0, out_channels, 1);
}
MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3Winograd, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(
Conv2d, CPU, float, NEON, K3x3Winograd))
} // namespace fp32
} // namespace arm
} // namespace ops
......
...@@ -18,11 +18,11 @@
#include <vector>
#include <memory>

#include "mace/public/mace.h"
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/gemm.h"
#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/ops/arm/fp32/gemm.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
...@@ -31,10 +31,9 @@ namespace fp32 {

class Conv2dK3x3Winograd : public Conv2dBase {
 public:
  Conv2dK3x3Winograd(const std::vector<int> &paddings,
                     const Padding padding_type)
      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type),
        gemm_(),
  explicit Conv2dK3x3Winograd(const delegator::Conv2dParam &param)
      : Conv2dBase(param),
        gemm_(delegator::GemmParam()),
        transformed_filter_(nullptr),
        out_tile_size_(0) {}
......
...@@ -12,16 +12,30 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mace/ops/arm/fp32/conv_2d_5x5.h"

#include <arm_neon.h>
#include <memory>

#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/ops/delegator/conv_2d.h"

namespace mace {
namespace ops {
namespace arm {
namespace fp32 {

class Conv2dK5x5S1 : public Conv2dBase {
 public:
  explicit Conv2dK5x5S1(const delegator::Conv2dParam &param)
      : Conv2dBase(param) {}
  virtual ~Conv2dK5x5S1() {}

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      Tensor *output) override;
};

#define MACE_Conv2dNeonK5x5SnLoadCalc4 \
  /* load filter (4 outch x 1 height x 4 width) */ \
  float32x4_t vf00, vf10, vf20, vf30; \
...@@ -244,6 +258,9 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
  return MaceStatus::MACE_SUCCESS;
}

MACE_REGISTER_DELEGATOR(registry, Conv2dK5x5S1, delegator::Conv2dParam,
                        MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K5x5S1))

} // namespace fp32
} // namespace arm
} // namespace ops
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP32_CONV_2D_5X5_H_
#define MACE_OPS_ARM_FP32_CONV_2D_5X5_H_
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/conv_2d.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
class Conv2dK5x5S1 : public Conv2dBase {
public:
Conv2dK5x5S1(const std::vector<int> &paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK5x5S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
};
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_FP32_CONV_2D_5X5_H_
...@@ -17,6 +17,8 @@
#include <arm_neon.h>
#include <memory>

#include "mace/ops/delegator/conv_2d.h"

namespace mace {
namespace ops {
namespace arm {
...@@ -720,6 +722,13 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context,
  return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S1))
MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S2, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S2))
MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S3, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S3))
} // namespace fp32
} // namespace arm
} // namespace ops
......
...@@ -16,10 +16,11 @@
#define MACE_OPS_ARM_FP32_CONV_2D_7X7_H_

#include <vector>

#include "mace/public/mace.h"
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
...@@ -28,8 +29,8 @@ namespace fp32 {

class Conv2dK7x7S1 : public Conv2dBase {
 public:
  Conv2dK7x7S1(const std::vector<int> &paddings, const Padding padding_type)
      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
  explicit Conv2dK7x7S1(const delegator::Conv2dParam &param)
      : Conv2dBase(param) {}
  virtual ~Conv2dK7x7S1() {}

  MaceStatus Compute(
...@@ -41,8 +42,8 @@ class Conv2dK7x7S1 : public Conv2dBase {

class Conv2dK7x7S2 : public Conv2dBase {
 public:
  Conv2dK7x7S2(const std::vector<int> &paddings, const Padding padding_type)
      : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {}
  explicit Conv2dK7x7S2(const delegator::Conv2dParam &param)
      : Conv2dBase(param) {}
  virtual ~Conv2dK7x7S2() {}

  MaceStatus Compute(
...@@ -54,8 +55,8 @@ class Conv2dK7x7S2 : public Conv2dBase {

class Conv2dK7x7S3 : public Conv2dBase {
 public:
  Conv2dK7x7S3(const std::vector<int> &paddings, const Padding padding_type)
      : Conv2dBase({3, 3}, {1, 1}, paddings, padding_type) {}
  explicit Conv2dK7x7S3(const delegator::Conv2dParam &param)
      : Conv2dBase(param) {}
  virtual ~Conv2dK7x7S3() {}

  MaceStatus Compute(
......
...@@ -12,15 +12,30 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mace/ops/arm/fp32/conv_general.h"
#include "mace/ops/arm/fp32/conv_2d.h"

#include <memory>

#include "mace/ops/delegator/conv_2d.h"

namespace mace {
namespace ops {
namespace arm {
namespace fp32 {

class Conv2dGeneral : public Conv2dBase {
 public:
  explicit Conv2dGeneral(const delegator::Conv2dParam &param)
      : Conv2dBase(param) {}
  virtual ~Conv2dGeneral() {}

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      Tensor *output) override;
};

MaceStatus Conv2dGeneral::Compute(const OpContext *context,
                                  const Tensor *input,
                                  const Tensor *filter,
...@@ -237,6 +252,10 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context,
  return MaceStatus::MACE_SUCCESS;
}

MACE_REGISTER_DELEGATOR(
    registry, Conv2dGeneral, delegator::Conv2dParam,
    MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, General))

} // namespace fp32
} // namespace arm
} // namespace ops
......
...@@ -18,54 +18,27 @@
#include <vector>
#include <memory>

#include "mace/public/mace.h"
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/gemm.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/deconv_2d.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace arm {
namespace fp32 {

class Deconv2dBase {
class Deconv2dBase : public delegator::Deconv2d {
 public:
  Deconv2dBase(const std::vector<int> &strides,
               const std::vector<int> &dilations,
               const std::vector<int> &paddings,
               const Padding padding_type,
               const index_t group,
               const FrameworkType framework_type)
      : strides_(strides),
        dilations_(dilations),
        paddings_(paddings),
        padding_type_(padding_type),
        group_(group),
        framework_type_(framework_type) {}

  Deconv2dBase(const std::vector<int> &strides,
               const std::vector<int> &dilations,
               const std::vector<int> &paddings,
               const Padding padding_type,
               const FrameworkType framework_type)
      : Deconv2dBase(strides,
                     dilations,
                     paddings,
                     padding_type,
                     1,
                     framework_type) {}
  explicit Deconv2dBase(const delegator::Deconv2dParam &param)
      : delegator::Deconv2d(param),
        group_(param.group_) {}

  virtual ~Deconv2dBase() = default;

  virtual MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      const Tensor *output_shape,
      Tensor *output) = 0;

 protected:
  MaceStatus ResizeOutAndPadOut(const OpContext *context,
                                const Tensor *input,
...@@ -78,13 +51,7 @@ class Deconv2dBase {
  void UnPadOutput(const Tensor &src,
                   const std::vector<int> &out_pad_size,
                   Tensor *dst);

  const std::vector<int> strides_;
  const std::vector<int> dilations_;
  const std::vector<int> paddings_;
  const Padding padding_type_;
  index_t group_;
  const FrameworkType framework_type_;
};

} // namespace fp32
......
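`Deconv2dParam` evidently bundles what the two deleted constructors used to take, including `group_` (read back via `param.group_` above) and presumably the framework type. A hypothetical construction, assuming a constructor that mirrors the removed argument list:

```c++
// Assumed signature, for illustration only:
delegator::Deconv2dParam param(strides, dilations, paddings, padding_type,
                               group, framework_type);
Deconv2dK3x3S1 deconv(param);  // any kernel class shown in this commit
```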
...@@ -330,12 +330,18 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context,
    }
  }, 0, batch, 1, 0, outch, 1);

  UnPadOutput(*out_tensor, out_pad_size, output);
  return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Deconv2dK2x2S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K2x2S1))
MACE_REGISTER_DELEGATOR(registry, Deconv2dK2x2S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K2x2S2))
} // namespace fp32
} // namespace arm
} // namespace ops
......
...@@ -18,12 +18,12 @@
#include <vector>
#include <memory>

#include "mace/public/mace.h"
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/deconv_2d.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
...@@ -32,10 +32,8 @@ namespace fp32 {

class Deconv2dK2x2S1 : public Deconv2dBase {
 public:
  Deconv2dK2x2S1(const std::vector<int> &paddings,
                 const Padding padding_type,
                 const FrameworkType framework_type)
      : Deconv2dBase({1, 1}, {1, 1}, paddings, padding_type, framework_type) {}
  explicit Deconv2dK2x2S1(const delegator::Deconv2dParam &param)
      : Deconv2dBase(param) {}
  virtual ~Deconv2dK2x2S1() {}

  MaceStatus Compute(
...@@ -48,10 +46,8 @@ class Deconv2dK2x2S1 : public Deconv2dBase {

class Deconv2dK2x2S2 : public Deconv2dBase {
 public:
  Deconv2dK2x2S2(const std::vector<int> &paddings,
                 const Padding padding_type,
                 const FrameworkType framework_type)
      : Deconv2dBase({2, 2}, {1, 1}, paddings, padding_type, framework_type) {}
  explicit Deconv2dK2x2S2(const delegator::Deconv2dParam &param)
      : Deconv2dBase(param) {}
  virtual ~Deconv2dK2x2S2() {}

  MaceStatus Compute(
......
...@@ -464,6 +464,13 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context,
  return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Deconv2dK3x3S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K3x3S1))
MACE_REGISTER_DELEGATOR(registry, Deconv2dK3x3S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K3x3S2))
} // namespace fp32
} // namespace arm
} // namespace ops
......
...@@ -18,12 +18,12 @@
#include <vector>
#include <memory>

#include "mace/public/mace.h"
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/deconv_2d.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
...@@ -32,10 +32,8 @@ namespace fp32 {

class Deconv2dK3x3S1 : public Deconv2dBase {
 public:
  Deconv2dK3x3S1(const std::vector<int> &paddings,
                 const Padding padding_type,
                 const FrameworkType framework_type)
      : Deconv2dBase({1, 1}, {1, 1}, paddings, padding_type, framework_type) {}
  explicit Deconv2dK3x3S1(const delegator::Deconv2dParam &param)
      : Deconv2dBase(param) {}
  virtual ~Deconv2dK3x3S1() {}

  MaceStatus Compute(
...@@ -48,10 +46,8 @@ class Deconv2dK3x3S1 : public Deconv2dBase {

class Deconv2dK3x3S2 : public Deconv2dBase {
 public:
  Deconv2dK3x3S2(const std::vector<int> &paddings,
                 const Padding padding_type,
                 const FrameworkType framework_type)
      : Deconv2dBase({2, 2}, {1, 1}, paddings, padding_type, framework_type) {}
  explicit Deconv2dK3x3S2(const delegator::Deconv2dParam &param)
      : Deconv2dBase(param) {}
  virtual ~Deconv2dK3x3S2() {}

  MaceStatus Compute(
......
...@@ -449,7 +449,6 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, ...@@ -449,7 +449,6 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context,
const index_t outw = out_shape[3]; const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw; const index_t out_img_size = outh * outw;
utils::ThreadPool utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool(); &thread_pool = context->device()->cpu_runtime()->thread_pool();
...@@ -575,6 +574,13 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, ...@@ -575,6 +574,13 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(registry, Deconv2dK4x4S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K4x4S1))
MACE_REGISTER_DELEGATOR(registry, Deconv2dK4x4S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K4x4S2))
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
......
...@@ -18,12 +18,12 @@ ...@@ -18,12 +18,12 @@
#include <vector> #include <vector>
#include <memory> #include <memory>
#include "mace/public/mace.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/fp32/deconv_2d.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -32,10 +32,8 @@ namespace fp32 { ...@@ -32,10 +32,8 @@ namespace fp32 {
class Deconv2dK4x4S1 : public Deconv2dBase { class Deconv2dK4x4S1 : public Deconv2dBase {
public: public:
Deconv2dK4x4S1(const std::vector<int> &paddings, explicit Deconv2dK4x4S1(const delegator::Deconv2dParam &param)
const Padding padding_type, : Deconv2dBase(param) {}
const FrameworkType framework_type)
: Deconv2dBase({1, 1}, {1, 1}, paddings, padding_type, framework_type) {}
virtual ~Deconv2dK4x4S1() {} virtual ~Deconv2dK4x4S1() {}
MaceStatus Compute( MaceStatus Compute(
...@@ -48,10 +46,8 @@ class Deconv2dK4x4S1 : public Deconv2dBase { ...@@ -48,10 +46,8 @@ class Deconv2dK4x4S1 : public Deconv2dBase {
class Deconv2dK4x4S2 : public Deconv2dBase { class Deconv2dK4x4S2 : public Deconv2dBase {
public: public:
Deconv2dK4x4S2(const std::vector<int> &paddings, explicit Deconv2dK4x4S2(const delegator::Deconv2dParam &param)
const Padding padding_type, : Deconv2dBase(param) {}
const FrameworkType framework_type)
: Deconv2dBase({2, 2}, {1, 1}, paddings, padding_type, framework_type) {}
virtual ~Deconv2dK4x4S2() {} virtual ~Deconv2dK4x4S2() {}
MaceStatus Compute( MaceStatus Compute(
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/deconv_2d_general.h" #include "mace/ops/arm/fp32/deconv_2d.h"
// TODO(liutuo): optimize it // TODO(liutuo): optimize it
...@@ -21,6 +21,20 @@ namespace ops { ...@@ -21,6 +21,20 @@ namespace ops {
namespace arm { namespace arm {
namespace fp32 { namespace fp32 {
class Deconv2dGeneral : public Deconv2dBase {
public:
explicit Deconv2dGeneral(const delegator::Deconv2dParam &param)
: Deconv2dBase(param) {}
virtual ~Deconv2dGeneral() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
MaceStatus Deconv2dGeneral::Compute(const OpContext *context, MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -110,6 +124,10 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, ...@@ -110,6 +124,10 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(registry, Deconv2dGeneral, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, General))
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
......
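`deconv_2d_general.cc` now includes the shared `deconv_2d.h` instead of a dedicated `deconv_2d_general.h`, and the `Deconv2dGeneral` class body moves into the `.cc` file. Once lookup goes through the registry key, no other translation unit needs the concrete type, so the class can be private to its source file. A minimal sketch of the idea (illustrative only; MACE keeps the class in a named namespace rather than an anonymous one):

```c++
// impl.cc sketch: the concrete kernel lives only in this source file and is
// reachable solely through its registry key, never by type name.
#include <iostream>
#include <memory>

struct Kernel {  // the public interface a header would declare
  virtual ~Kernel() = default;
  virtual void Run() = 0;
};

namespace {  // file-local: no header declares this type anymore
class GeneralKernel : public Kernel {
 public:
  void Run() override { std::cout << "general (unoptimized) path\n"; }
};
}  // namespace

// What registration effectively exposes: a factory, not the class itself.
std::unique_ptr<Kernel> CreateGeneralKernel() {
  return std::make_unique<GeneralKernel>();
}

int main() {
  auto k = CreateGeneralKernel();
  k->Run();  // callers depend only on the Kernel interface
}
```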
...@@ -512,6 +512,13 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, ...@@ -512,6 +512,13 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S1, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, NEON, K3x3S1))
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S2, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, NEON, K3x3S2))
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
......
...@@ -16,10 +16,12 @@ ...@@ -16,10 +16,12 @@
#define MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ #define MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_
#include <vector> #include <vector>
#include "mace/public/mace.h"
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/conv_2d.h" #include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/ops/delegator/depthwise_conv_2d.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -28,9 +30,8 @@ namespace fp32 { ...@@ -28,9 +30,8 @@ namespace fp32 {
class DepthwiseConv2dK3x3S1 : public Conv2dBase { class DepthwiseConv2dK3x3S1 : public Conv2dBase {
public: public:
DepthwiseConv2dK3x3S1(const std::vector<int> &paddings, explicit DepthwiseConv2dK3x3S1(const delegator::DepthwiseConv2dParam &param)
const Padding padding_type) : Conv2dBase(param) {}
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~DepthwiseConv2dK3x3S1() {} virtual ~DepthwiseConv2dK3x3S1() {}
MaceStatus Compute( MaceStatus Compute(
...@@ -42,9 +43,8 @@ class DepthwiseConv2dK3x3S1 : public Conv2dBase { ...@@ -42,9 +43,8 @@ class DepthwiseConv2dK3x3S1 : public Conv2dBase {
class DepthwiseConv2dK3x3S2 : public Conv2dBase { class DepthwiseConv2dK3x3S2 : public Conv2dBase {
public: public:
DepthwiseConv2dK3x3S2(const std::vector<int> &paddings, explicit DepthwiseConv2dK3x3S2(const delegator::DepthwiseConv2dParam &param)
const Padding padding_type) : Conv2dBase(param) {}
: Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {}
virtual ~DepthwiseConv2dK3x3S2() {} virtual ~DepthwiseConv2dK3x3S2() {}
MaceStatus Compute( MaceStatus Compute(
......
...@@ -776,6 +776,20 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, ...@@ -776,6 +776,20 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S1, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K3x3S1))
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S2, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K3x3S2))
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S1, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K3x3S1))
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S2, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K3x3S2))
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
......
...@@ -18,12 +18,13 @@ ...@@ -18,12 +18,13 @@
#include <vector> #include <vector>
#include <memory> #include <memory>
#include "mace/public/mace.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/fp32/deconv_2d.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -32,14 +33,9 @@ namespace fp32 { ...@@ -32,14 +33,9 @@ namespace fp32 {
class DepthwiseDeconv2dK3x3S1 : public Deconv2dBase { class DepthwiseDeconv2dK3x3S1 : public Deconv2dBase {
public: public:
DepthwiseDeconv2dK3x3S1(const std::vector<int> &paddings, explicit DepthwiseDeconv2dK3x3S1(
const Padding padding_type, const delegator::DepthwiseDeconv2dParam &param)
const FrameworkType framework_type) : Deconv2dBase(param) {}
: Deconv2dBase({1, 1},
{1, 1},
paddings,
padding_type,
framework_type) {}
virtual ~DepthwiseDeconv2dK3x3S1() {} virtual ~DepthwiseDeconv2dK3x3S1() {}
MaceStatus Compute( MaceStatus Compute(
...@@ -52,14 +48,9 @@ class DepthwiseDeconv2dK3x3S1 : public Deconv2dBase { ...@@ -52,14 +48,9 @@ class DepthwiseDeconv2dK3x3S1 : public Deconv2dBase {
class DepthwiseDeconv2dK3x3S2 : public Deconv2dBase { class DepthwiseDeconv2dK3x3S2 : public Deconv2dBase {
public: public:
DepthwiseDeconv2dK3x3S2(const std::vector<int> &paddings, explicit DepthwiseDeconv2dK3x3S2(
const Padding padding_type, const delegator::DepthwiseDeconv2dParam &param)
const FrameworkType framework_type) : Deconv2dBase(param) {}
: Deconv2dBase({2, 2},
{1, 1},
paddings,
padding_type,
framework_type) {}
virtual ~DepthwiseDeconv2dK3x3S2() {} virtual ~DepthwiseDeconv2dK3x3S2() {}
MaceStatus Compute( MaceStatus Compute(
...@@ -72,16 +63,9 @@ class DepthwiseDeconv2dK3x3S2 : public Deconv2dBase { ...@@ -72,16 +63,9 @@ class DepthwiseDeconv2dK3x3S2 : public Deconv2dBase {
class GroupDeconv2dK3x3S1 : public Deconv2dBase { class GroupDeconv2dK3x3S1 : public Deconv2dBase {
public: public:
GroupDeconv2dK3x3S1(const std::vector<int> &paddings, explicit GroupDeconv2dK3x3S1(
const Padding padding_type, const delegator::GroupDeconv2dParam &param)
const int group, : Deconv2dBase(param) {}
const FrameworkType framework_type)
: Deconv2dBase({1, 1},
{1, 1},
paddings,
padding_type,
group,
framework_type) {}
virtual ~GroupDeconv2dK3x3S1() {} virtual ~GroupDeconv2dK3x3S1() {}
MaceStatus Compute( MaceStatus Compute(
...@@ -94,16 +78,8 @@ class GroupDeconv2dK3x3S1 : public Deconv2dBase { ...@@ -94,16 +78,8 @@ class GroupDeconv2dK3x3S1 : public Deconv2dBase {
class GroupDeconv2dK3x3S2 : public Deconv2dBase { class GroupDeconv2dK3x3S2 : public Deconv2dBase {
public: public:
GroupDeconv2dK3x3S2(const std::vector<int> &paddings, explicit GroupDeconv2dK3x3S2(const delegator::GroupDeconv2dParam &param)
const Padding padding_type, : Deconv2dBase(param) {}
const int group,
const FrameworkType framework_type)
: Deconv2dBase({2, 2},
{1, 1},
paddings,
padding_type,
group,
framework_type) {}
virtual ~GroupDeconv2dK3x3S2() {} virtual ~GroupDeconv2dK3x3S2() {}
MaceStatus Compute( MaceStatus Compute(
......
...@@ -959,6 +959,20 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, ...@@ -959,6 +959,20 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S1, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K4x4S1))
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S2, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K4x4S2))
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S1, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K4x4S1))
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S2, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K4x4S2))
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
......
...@@ -18,12 +18,13 @@ ...@@ -18,12 +18,13 @@
#include <vector> #include <vector>
#include <memory> #include <memory>
#include "mace/public/mace.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/fp32/deconv_2d.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -32,14 +33,9 @@ namespace fp32 { ...@@ -32,14 +33,9 @@ namespace fp32 {
class DepthwiseDeconv2dK4x4S1 : public Deconv2dBase { class DepthwiseDeconv2dK4x4S1 : public Deconv2dBase {
public: public:
DepthwiseDeconv2dK4x4S1(const std::vector<int> &paddings, explicit DepthwiseDeconv2dK4x4S1(
const Padding padding_type, const delegator::DepthwiseDeconv2dParam &param)
const FrameworkType framework_type) : Deconv2dBase(param) {}
: Deconv2dBase({1, 1},
{1, 1},
paddings,
padding_type,
framework_type) {}
virtual ~DepthwiseDeconv2dK4x4S1() {} virtual ~DepthwiseDeconv2dK4x4S1() {}
MaceStatus Compute( MaceStatus Compute(
...@@ -52,14 +48,9 @@ class DepthwiseDeconv2dK4x4S1 : public Deconv2dBase { ...@@ -52,14 +48,9 @@ class DepthwiseDeconv2dK4x4S1 : public Deconv2dBase {
class DepthwiseDeconv2dK4x4S2 : public Deconv2dBase { class DepthwiseDeconv2dK4x4S2 : public Deconv2dBase {
public: public:
DepthwiseDeconv2dK4x4S2(const std::vector<int> &paddings, explicit DepthwiseDeconv2dK4x4S2(
const Padding padding_type, const delegator::DepthwiseDeconv2dParam &param)
const FrameworkType framework_type) : Deconv2dBase(param) {}
: Deconv2dBase({2, 2},
{1, 1},
paddings,
padding_type,
framework_type) {}
virtual ~DepthwiseDeconv2dK4x4S2() {} virtual ~DepthwiseDeconv2dK4x4S2() {}
MaceStatus Compute( MaceStatus Compute(
...@@ -72,16 +63,8 @@ class DepthwiseDeconv2dK4x4S2 : public Deconv2dBase { ...@@ -72,16 +63,8 @@ class DepthwiseDeconv2dK4x4S2 : public Deconv2dBase {
class GroupDeconv2dK4x4S1 : public Deconv2dBase { class GroupDeconv2dK4x4S1 : public Deconv2dBase {
public: public:
GroupDeconv2dK4x4S1(const std::vector<int> &paddings, explicit GroupDeconv2dK4x4S1(const delegator::GroupDeconv2dParam &param)
const Padding padding_type, : Deconv2dBase(param) {}
const int group,
const FrameworkType framework_type)
: Deconv2dBase({1, 1},
{1, 1},
paddings,
padding_type,
group,
framework_type) {}
virtual ~GroupDeconv2dK4x4S1() {} virtual ~GroupDeconv2dK4x4S1() {}
MaceStatus Compute( MaceStatus Compute(
...@@ -94,16 +77,8 @@ class GroupDeconv2dK4x4S1 : public Deconv2dBase { ...@@ -94,16 +77,8 @@ class GroupDeconv2dK4x4S1 : public Deconv2dBase {
class GroupDeconv2dK4x4S2 : public Deconv2dBase { class GroupDeconv2dK4x4S2 : public Deconv2dBase {
public: public:
GroupDeconv2dK4x4S2(const std::vector<int> &paddings, explicit GroupDeconv2dK4x4S2(const delegator::GroupDeconv2dParam &param)
const Padding padding_type, : Deconv2dBase(param) {}
const int group,
const FrameworkType framework_type)
: Deconv2dBase({2, 2},
{1, 1},
paddings,
padding_type,
group,
framework_type) {}
virtual ~GroupDeconv2dK4x4S2() {} virtual ~GroupDeconv2dK4x4S2() {}
MaceStatus Compute( MaceStatus Compute(
......
...@@ -207,6 +207,14 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, ...@@ -207,6 +207,14 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dGeneral, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, General))
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dGeneral, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, General))
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
......
...@@ -18,12 +18,13 @@ ...@@ -18,12 +18,13 @@
#include <vector> #include <vector>
#include <memory> #include <memory>
#include "mace/public/mace.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/fp32/deconv_2d.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -32,16 +33,9 @@ namespace fp32 { ...@@ -32,16 +33,9 @@ namespace fp32 {
class DepthwiseDeconv2dGeneral : public Deconv2dBase { class DepthwiseDeconv2dGeneral : public Deconv2dBase {
public: public:
DepthwiseDeconv2dGeneral(const std::vector<int> &strides, explicit DepthwiseDeconv2dGeneral(
const std::vector<int> &dilations, const delegator::DepthwiseDeconv2dParam &param)
const std::vector<int> &paddings, : Deconv2dBase(param) {}
const Padding padding_type,
const FrameworkType framework_type)
: Deconv2dBase(strides,
dilations,
paddings,
padding_type,
framework_type) {}
virtual ~DepthwiseDeconv2dGeneral() {} virtual ~DepthwiseDeconv2dGeneral() {}
MaceStatus Compute( MaceStatus Compute(
...@@ -54,18 +48,8 @@ class DepthwiseDeconv2dGeneral : public Deconv2dBase { ...@@ -54,18 +48,8 @@ class DepthwiseDeconv2dGeneral : public Deconv2dBase {
class GroupDeconv2dGeneral : public Deconv2dBase { class GroupDeconv2dGeneral : public Deconv2dBase {
public: public:
GroupDeconv2dGeneral(const std::vector<int> &strides, explicit GroupDeconv2dGeneral(const delegator::GroupDeconv2dParam &param)
const std::vector<int> &dilations, : Deconv2dBase(param) {}
const std::vector<int> &paddings,
const Padding padding_type,
const int group,
const FrameworkType framework_type)
: Deconv2dBase(strides,
dilations,
paddings,
padding_type,
group,
framework_type) {}
virtual ~GroupDeconv2dGeneral() {} virtual ~GroupDeconv2dGeneral() {}
MaceStatus Compute( MaceStatus Compute(
......
...@@ -1224,6 +1224,9 @@ MaceStatus Gemm::Compute(const OpContext *context, ...@@ -1224,6 +1224,9 @@ MaceStatus Gemm::Compute(const OpContext *context,
output); output);
} }
MACE_REGISTER_DELEGATOR(registry, Gemm, delegator::GemmParam,
MACE_DELEGATOR_KEY(Gemm, CPU, float, NEON))
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
......
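`Gemm` registers with the four-argument `MACE_DELEGATOR_KEY` rather than the `_EX` form used by the shaped deconv kernels above: there is a single NEON float Gemm, so no kernel-variant tag is needed. Assuming the keys compose like plain strings (the macros' real encoding is not shown in this diff), the two arities might look like:

```c++
// Assumption: delegator keys behave like composed strings; the real macros
// may encode them differently. This only contrasts the two arities.
#include <iostream>
#include <string>

std::string DelegatorKey(const std::string &op, const std::string &device,
                         const std::string &dtype, const std::string &impl) {
  return op + ":" + device + ":" + dtype + ":" + impl;
}

std::string DelegatorKeyEx(const std::string &op, const std::string &device,
                           const std::string &dtype, const std::string &impl,
                           const std::string &tag) {
  return DelegatorKey(op, device, dtype, impl) + ":" + tag;  // variant tag
}

int main() {
  // One implementation per (op, device, dtype, impl): no tag needed.
  std::cout << DelegatorKey("Gemm", "CPU", "float", "NEON") << "\n";
  // Several shape-specialized kernels: the tag disambiguates them.
  std::cout << DelegatorKeyEx("Deconv2d", "CPU", "float", "NEON", "K2x2S1")
            << "\n";
}
```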
...@@ -15,10 +15,11 @@ ...@@ -15,10 +15,11 @@
#ifndef MACE_OPS_ARM_FP32_GEMM_H_ #ifndef MACE_OPS_ARM_FP32_GEMM_H_
#define MACE_OPS_ARM_FP32_GEMM_H_ #define MACE_OPS_ARM_FP32_GEMM_H_
#include "mace/public/mace.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/common/matrix.h" #include "mace/ops/common/matrix.h"
#include "mace/ops/delegator/gemm.h"
#include "mace/public/mace.h"
#include "mace/utils/math.h" #include "mace/utils/math.h"
// This implements matrix-matrix multiplication. // This implements matrix-matrix multiplication.
...@@ -29,13 +30,12 @@ namespace ops { ...@@ -29,13 +30,12 @@ namespace ops {
namespace arm { namespace arm {
namespace fp32 { namespace fp32 {
class Gemm { class Gemm : public delegator::Gemm {
public: public:
explicit Gemm(const bool should_cache_pack) explicit Gemm(const delegator::GemmParam &param)
: pack_cache_(GetCPUAllocator()), : delegator::Gemm(param), pack_cache_(GetCPUAllocator()),
should_cache_pack_(should_cache_pack), should_cache_pack_(param.should_cache_pack_),
cached_(0) {} cached_(0) {}
Gemm() : Gemm(false) {}
~Gemm() {} ~Gemm() {}
MaceStatus Compute( MaceStatus Compute(
...@@ -51,7 +51,7 @@ class Gemm { ...@@ -51,7 +51,7 @@ class Gemm {
const MatrixMajor output_major, const MatrixMajor output_major,
const bool lhs_batched, const bool lhs_batched,
const bool rhs_batched, const bool rhs_batched,
Tensor *output); Tensor *output) override;
// Original matrix before transpose has row-major // Original matrix before transpose has row-major
MaceStatus Compute( MaceStatus Compute(
...@@ -68,7 +68,7 @@ class Gemm { ...@@ -68,7 +68,7 @@ class Gemm {
const bool transpose_out, const bool transpose_out,
const bool lhs_batched, const bool lhs_batched,
const bool rhs_batched, const bool rhs_batched,
Tensor *output); Tensor *output) override;
private: private:
void ComputeBlock(const float *packed_lhs_data, void ComputeBlock(const float *packed_lhs_data,
......
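The fp32 `Gemm` stops being a free-standing class: it now derives from `delegator::Gemm`, forwards the param to the base, reads `should_cache_pack_` from the param instead of a dedicated constructor argument, and marks both `Compute` overloads `override` so the compiler checks them against the interface. A simplified sketch of that split (the real `Compute` overloads take tensors and matrix-major flags):

```c++
// Simplified sketch of the interface/implementation split; the real
// delegator::Gemm::Compute overloads take tensors and matrix-major flags.
#include <iostream>

struct GemmParam { bool should_cache_pack_ = false; };

namespace delegator {
class Gemm {  // device-neutral interface the op code programs against
 public:
  explicit Gemm(const GemmParam &) {}
  virtual ~Gemm() = default;
  virtual int Compute(int m, int n, int k) = 0;
};
}  // namespace delegator

namespace arm {
namespace fp32 {
class Gemm : public delegator::Gemm {  // the NEON implementation
 public:
  explicit Gemm(const GemmParam &param)
      : delegator::Gemm(param),
        should_cache_pack_(param.should_cache_pack_) {}
  // `override` makes the compiler verify this matches the interface.
  int Compute(int m, int n, int k) override {
    std::cout << (should_cache_pack_ ? "reusing packed lhs\n" : "packing\n");
    return m * n * k;  // placeholder for the real packed GEMM
  }
 private:
  bool should_cache_pack_;
};
}  // namespace fp32
}  // namespace arm

int main() {
  GemmParam param;
  param.should_cache_pack_ = true;
  arm::fp32::Gemm gemm(param);
  gemm.Compute(2, 3, 4);
}
```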
...@@ -378,6 +378,10 @@ MaceStatus Gemv::Compute(const OpContext *context, ...@@ -378,6 +378,10 @@ MaceStatus Gemv::Compute(const OpContext *context,
#undef vaddvq_f32 #undef vaddvq_f32
#endif #endif
MACE_REGISTER_DELEGATOR(registry, Gemv, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, CPU, float, NEON))
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
......
...@@ -15,18 +15,19 @@ ...@@ -15,18 +15,19 @@
#ifndef MACE_OPS_ARM_FP32_GEMV_H_ #ifndef MACE_OPS_ARM_FP32_GEMV_H_
#define MACE_OPS_ARM_FP32_GEMV_H_ #define MACE_OPS_ARM_FP32_GEMV_H_
#include "mace/public/mace.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/op_context.h" #include "mace/ops/delegator/gemv.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 { namespace fp32 {
class Gemv { class Gemv : public delegator::Gemv {
public: public:
Gemv() {} explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {}
~Gemv() {} ~Gemv() {}
// Always row-major after transpose // Always row-major after transpose
MaceStatus Compute( MaceStatus Compute(
...@@ -39,7 +40,7 @@ class Gemv { ...@@ -39,7 +40,7 @@ class Gemv {
const index_t lhs_width, const index_t lhs_width,
const bool lhs_batched, const bool lhs_batched,
const bool rhs_batched, const bool rhs_batched,
Tensor *output); Tensor *output) override;
}; };
} // namespace fp32 } // namespace fp32
......
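Unlike `Gemm`, `Gemv` has no configuration of its own, so its constructor takes the plain `DelegatorParam` base type rather than a specialized param. A sketch of that convention; everything beyond the two param type names visible in the diff is an assumption:

```c++
// Sketch only: the derivation and member names beyond DelegatorParam /
// GemmParam are assumptions.
struct DelegatorParam {};  // empty base: sufficient for Gemv

struct GemmParam : DelegatorParam {  // Gemm adds its caching switch
  bool should_cache_pack_ = false;
};

struct GemvDelegator {
  explicit GemvDelegator(const DelegatorParam &) {}  // nothing to configure
};

struct GemmDelegator {
  explicit GemmDelegator(const GemmParam &param)
      : should_cache_pack_(param.should_cache_pack_) {}
  bool should_cache_pack_;
};

int main() {
  GemvDelegator gemv{DelegatorParam{}};
  GemmParam gemm_param;
  gemm_param.should_cache_pack_ = true;
  GemmDelegator gemm{gemm_param};
  (void)gemv;
  (void)gemm;
}
```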
...@@ -12,12 +12,11 @@ ...@@ -12,12 +12,11 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/q8/eltwise.h"
#include <arm_neon.h> #include <arm_neon.h>
#include <algorithm> #include <algorithm>
#include "mace/ops/common/gemmlowp_util.h" #include "mace/ops/common/gemmlowp_util.h"
#include "mace/ops/delegator/eltwise.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
namespace mace { namespace mace {
...@@ -25,6 +24,16 @@ namespace ops { ...@@ -25,6 +24,16 @@ namespace ops {
namespace arm { namespace arm {
namespace q8 { namespace q8 {
class Eltwise : public delegator::Eltwise {
public:
explicit Eltwise(const delegator::EltwiseParam &param)
: delegator::Eltwise(param) {}
~Eltwise() = default;
MaceStatus Compute(const OpContext *context, const Tensor *input0,
const Tensor *input1, Tensor *output) override;
};
MaceStatus Eltwise::Compute(const OpContext *context, MaceStatus Eltwise::Compute(const OpContext *context,
const Tensor *input0, const Tensor *input0,
const Tensor *input1, const Tensor *input1,
...@@ -144,7 +153,7 @@ MaceStatus Eltwise::Compute(const OpContext *context, ...@@ -144,7 +153,7 @@ MaceStatus Eltwise::Compute(const OpContext *context,
gemmlowp::SaturatingRoundingDoublingHighMul( gemmlowp::SaturatingRoundingDoublingHighMul(
res, output_multiplier), res, output_multiplier),
-output_shift) + -output_shift) +
output->zero_point(); output->zero_point();
output_ptr[i] = Saturate<uint8_t>(output_val); output_ptr[i] = Saturate<uint8_t>(output_val);
} }
}, },
...@@ -153,6 +162,9 @@ MaceStatus Eltwise::Compute(const OpContext *context, ...@@ -153,6 +162,9 @@ MaceStatus Eltwise::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(registry, Eltwise, delegator::EltwiseParam,
MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, NEON))
} // namespace q8 } // namespace q8
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
......
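The q8 `Eltwise` follows the same header-to-`.cc` move and registers under a `uint8_t` key, so quantized and float kernels can coexist in one registry, distinguished by the data-type segment of the key. A toy illustration with hypothetical keys (only the `uint8_t` registration actually appears in this diff; the float key is invented for contrast):

```c++
// Toy registry with hypothetical keys: only the uint8_t Eltwise registration
// appears in this diff; the float key is invented for contrast.
#include <functional>
#include <iostream>
#include <map>
#include <string>

int main() {
  std::map<std::string, std::function<void()>> registry;
  registry["Eltwise:CPU:float:NEON"] = [] { std::cout << "fp32 eltwise\n"; };
  registry["Eltwise:CPU:uint8_t:NEON"] = [] { std::cout << "q8 eltwise\n"; };
  // A quantized graph resolves the uint8_t variant via the dtype segment.
  registry.at("Eltwise:CPU:uint8_t:NEON")();
}
```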
...@@ -181,6 +181,14 @@ class Gemv<uint8_t>; ...@@ -181,6 +181,14 @@ class Gemv<uint8_t>;
template template
class Gemv<int32_t>; class Gemv<int32_t>;
typedef Gemv<uint8_t> GemvUint8;
MACE_REGISTER_DELEGATOR(registry, GemvUint8, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, NEON))
typedef Gemv<int32_t> GemvInt32;
MACE_REGISTER_DELEGATOR(registry, GemvInt32, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, CPU, int32_t, NEON))
} // namespace q8 } // namespace q8
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
......
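`Gemv<uint8_t>` and `Gemv<int32_t>` are aliased to `GemvUint8`/`GemvInt32` before being registered. A plausible reason, sketched below: registration macros of this style paste the class name into generated identifiers, and `<`/`>` are not legal identifier characters, so a template instantiation needs a clean single token first. The paste-based macro here is illustrative, not MACE's:

```c++
// Sketch of why template instantiations get typedef'd before registration.
// This paste-based macro is illustrative of the pattern, not MACE's macro.
#include <cstdint>
#include <iostream>

template <typename T>
struct Gemv {
  static void Run() { std::cout << sizeof(T) << "-byte output type\n"; }
};

// A registrar whose *identifier* embeds the class name. "Gemv<uint8_t>"
// could not be pasted here: '<' and '>' are illegal in identifiers.
#define REGISTER(ClassName)                              \
  struct Registrar_##ClassName {                         \
    Registrar_##ClassName() { ClassName::Run(); }        \
  } registrar_instance_##ClassName;

typedef Gemv<std::uint8_t> GemvUint8;  // alias yields one clean token
typedef Gemv<std::int32_t> GemvInt32;

REGISTER(GemvUint8)
REGISTER(GemvInt32)

int main() {}  // the registrars run during static initialization
```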
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,15 +12,10 @@ ...@@ -12,15 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// This implements matrix-vector multiplication described as
// https://github.com/google/gemmlowp/blob/master/todo/fast-gemv.txt
#ifndef MACE_OPS_ARM_Q8_GEMV_H_ #ifndef MACE_OPS_ARM_Q8_GEMV_H_
#define MACE_OPS_ARM_Q8_GEMV_H_ #define MACE_OPS_ARM_Q8_GEMV_H_
#include "mace/public/mace.h" #include "mace/ops/delegator/gemv.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -28,11 +23,11 @@ namespace arm { ...@@ -28,11 +23,11 @@ namespace arm {
namespace q8 { namespace q8 {
template<typename OUTPUT_TYPE> template<typename OUTPUT_TYPE>
class Gemv { class Gemv : public delegator::Gemv {
public: public:
Gemv() : is_output_type_uint8_( explicit Gemv(const DelegatorParam &param)
DataTypeToEnum<OUTPUT_TYPE>::value == DataType::DT_UINT8) { : delegator::Gemv(param), is_output_type_uint8_(
} DataTypeToEnum<OUTPUT_TYPE>::value == DataType::DT_UINT8) {}
~Gemv() {} ~Gemv() {}
// Always row-major after transpose // Always row-major after transpose
MaceStatus Compute( MaceStatus Compute(
...@@ -45,7 +40,7 @@ class Gemv { ...@@ -45,7 +40,7 @@ class Gemv {
const index_t lhs_width, const index_t lhs_width,
const bool lhs_batched, const bool lhs_batched,
const bool rhs_batched, const bool rhs_batched,
Tensor *output); Tensor *output) override;
private: private:
bool is_output_type_uint8_; bool is_output_type_uint8_;
......
...@@ -17,7 +17,8 @@ ...@@ -17,7 +17,8 @@
#include <algorithm> #include <algorithm>
#include <limits> #include <limits>
#include "mace/core/operator.h" #include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/quantize.h" #include "mace/core/quantize.h"
...@@ -106,12 +107,12 @@ class DequantizeOp<DeviceType::CPU, T> : public Operation { ...@@ -106,12 +107,12 @@ class DequantizeOp<DeviceType::CPU, T> : public Operation {
QuantizeUtil<float, T> quantize_util_; QuantizeUtil<float, T> quantize_util_;
}; };
void RegisterQuantize(OpRegistryBase *op_registry) { void RegisterQuantize(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Quantize", QuantizeOp, MACE_REGISTER_OP(op_registry, "Quantize", QuantizeOp,
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
} }
void RegisterDequantize(OpRegistryBase *op_registry) { void RegisterDequantize(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Dequantize", DequantizeOp, MACE_REGISTER_OP(op_registry, "Dequantize", DequantizeOp,
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
MACE_REGISTER_OP(op_registry, "Dequantize", DequantizeOp, MACE_REGISTER_OP(op_registry, "Dequantize", DequantizeOp,
......
...@@ -16,14 +16,10 @@ ...@@ -16,14 +16,10 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "mace/core/operator.h" #include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h"
#include "mace/ops/activation.h" #include "mace/ops/activation.h"
#include "mace/ops/delegator/activation.h"
#if defined(MACE_ENABLE_NEON)
#include "mace/ops/arm/fp32/activation.h"
#else
#include "mace/ops/ref/activation.h"
#endif
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/buffer_transformer.h"
...@@ -45,11 +41,16 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation { ...@@ -45,11 +41,16 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
epsilon_(Operation::GetOptionalArg<float>("epsilon", epsilon_(Operation::GetOptionalArg<float>("epsilon",
static_cast<float>(1e-4))), static_cast<float>(1e-4))),
activation_delegator_( activation_delegator_(
ops::StringToActivationType( delegator::Activation::Create(
Operation::GetOptionalArg<std::string>("activation", "NOOP")), context->workspace(),
Operation::GetOptionalArg<float>("max_limit", 0.0f), MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE),
Operation::GetOptionalArg<float>( delegator::ActivationParam(
"leakyrelu_coefficient", 0.0f)) {} ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP")),
Operation::GetOptionalArg<float>("max_limit", 0.0f),
Operation::GetOptionalArg<float>("leakyrelu_coefficient",
0.0f)))) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
...@@ -142,18 +143,14 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation { ...@@ -142,18 +143,14 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
}, 0, batch, 1, 0, channels, 1); }, 0, batch, 1, 0, channels, 1);
} }
activation_delegator_.Compute(context, output, output); activation_delegator_->Compute(context, output, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
private: private:
float epsilon_; float epsilon_;
#ifdef MACE_ENABLE_NEON std::unique_ptr<delegator::Activation> activation_delegator_;
arm::fp32::Activation activation_delegator_;
#else
ref::Activation activation_delegator_;
#endif // MACE_ENABLE_NEON
protected: protected:
MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
...@@ -232,7 +229,7 @@ class BatchNormOp<DeviceType::GPU, float> : public Operation { ...@@ -232,7 +229,7 @@ class BatchNormOp<DeviceType::GPU, float> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterBatchNorm(OpRegistryBase *op_registry) { void RegisterBatchNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp); MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp);
......
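This hunk shows the payoff at the op level: `BatchNormOp` drops the `#ifdef MACE_ENABLE_NEON` member selection between `arm::fp32::Activation` and `ref::Activation` in favor of a `std::unique_ptr<delegator::Activation>` obtained from `delegator::Activation::Create(workspace, key, param)`, and the call site changes from `.Compute` to `->Compute`. A compressed before/after sketch with simplified types; `CreateActivation` stands in for the real `Create`, and the key strings are illustrative:

```c++
// Before/after sketch with simplified types: implementation choice moves from
// preprocessor branches inside every op to a single registry-style lookup.
#include <iostream>
#include <memory>
#include <string>

struct Activation {  // stand-in for delegator::Activation
  virtual ~Activation() = default;
  virtual void Compute(float *data, int n) = 0;
};

struct NeonActivation : Activation {
  void Compute(float *, int) override { std::cout << "NEON path\n"; }
};
struct RefActivation : Activation {
  void Compute(float *, int) override { std::cout << "reference path\n"; }
};

std::unique_ptr<Activation> CreateActivation(const std::string &key) {
  // The key, not an #ifdef in the op, decides which class is built.
  if (key == "Activation:CPU:float:NEON")
    return std::make_unique<NeonActivation>();
  return std::make_unique<RefActivation>();
}

class BatchNormOp {
 public:
  explicit BatchNormOp(const std::string &key)
      : activation_delegator_(CreateActivation(key)) {}
  void Run(float *data, int n) { activation_delegator_->Compute(data, n); }
 private:
  // Was: arm::fp32::Activation or ref::Activation behind MACE_ENABLE_NEON.
  std::unique_ptr<Activation> activation_delegator_;
};

int main() {
  float buf[4] = {0.0f};
  BatchNormOp op("Activation:CPU:float:NEON");
  op.Run(buf, 4);
}
```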
...@@ -15,7 +15,8 @@ ...@@ -15,7 +15,8 @@
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include "mace/core/operator.h" #include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/batch_to_space.h" #include "mace/ops/opencl/image/batch_to_space.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -285,7 +286,7 @@ class BatchToSpaceNDOp<DeviceType::GPU, float> : public BatchToSpaceOpBase { ...@@ -285,7 +286,7 @@ class BatchToSpaceNDOp<DeviceType::GPU, float> : public BatchToSpaceOpBase {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterBatchToSpaceND(OpRegistryBase *op_registry) { void RegisterBatchToSpaceND(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND", MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, float); BatchToSpaceNDOp, DeviceType::CPU, float);
......
...@@ -16,14 +16,10 @@ ...@@ -16,14 +16,10 @@
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "mace/core/operator.h" #include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h"
#include "mace/ops/activation.h" #include "mace/ops/activation.h"
#include "mace/ops/delegator/bias_add.h"
#ifdef MACE_ENABLE_NEON
#include "mace/ops/arm/fp32/bias_add.h"
#else
#include "mace/ops/ref/bias_add.h"
#endif // MACE_ENABLE_NEON
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/buffer_transformer.h"
...@@ -42,8 +38,11 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation { ...@@ -42,8 +38,11 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation {
public: public:
explicit BiasAddOp(OpConstructContext *context) explicit BiasAddOp(OpConstructContext *context)
: Operation(context), : Operation(context),
has_data_format_(Operation::GetOptionalArg<int>("has_data_format", has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 0)),
0)) {} bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
...@@ -56,7 +55,7 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation { ...@@ -56,7 +55,7 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation {
MACE_CHECK(bias->dim_size() == 1 || bias->dim_size() == 2, MACE_CHECK(bias->dim_size() == 1 || bias->dim_size() == 2,
"bias must be 1-dimensional or n*c for caffee.", "bias must be 1-dimensional or n*c for caffee.",
MakeString(bias->shape())); MakeString(bias->shape()));
bias_add_delegator_.Compute(context, input, bias, output); bias_add_delegator_->Compute(context, input, bias, output);
} else { // NHWC } else { // NHWC
MACE_CHECK(bias->dim_size() == 1 || bias->dim_size() == 2, MACE_CHECK(bias->dim_size() == 1 || bias->dim_size() == 2,
"bias must be 1 or 2 dimensionals for caffee.", "bias must be 1 or 2 dimensionals for caffee.",
...@@ -115,11 +114,7 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation { ...@@ -115,11 +114,7 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation {
private: private:
int has_data_format_; int has_data_format_;
#ifdef MACE_ENABLE_NEON std::unique_ptr<delegator::BiasAdd> bias_add_delegator_;
arm::fp32::BiasAdd bias_add_delegator_;
#else
ref::BiasAdd bias_add_delegator_;
#endif // MACE_ENABLE_NEON
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -164,7 +159,7 @@ class BiasAddOp<DeviceType::GPU, float> : public Operation { ...@@ -164,7 +159,7 @@ class BiasAddOp<DeviceType::GPU, float> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterBiasAdd(OpRegistryBase *op_registry) { void RegisterBiasAdd(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp); MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp);
......
...@@ -12,7 +12,8 @@ ...@@ -12,7 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/core/operator.h" #include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h"
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) #if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
#include <arm_neon.h> #include <arm_neon.h>
...@@ -54,7 +55,7 @@ class CastOp : public Operation { ...@@ -54,7 +55,7 @@ class CastOp : public Operation {
MACE_OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
void RegisterCast(OpRegistryBase *op_registry) { void RegisterCast(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Cast", CastOp, MACE_REGISTER_OP(op_registry, "Cast", CastOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Cast", CastOp, MACE_REGISTER_OP(op_registry, "Cast", CastOp,
......
...@@ -14,7 +14,8 @@ ...@@ -14,7 +14,8 @@
#include <memory> #include <memory>
#include "mace/core/operator.h" #include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/channel_shuffle.h" #include "mace/ops/opencl/image/channel_shuffle.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -98,7 +99,7 @@ class ChannelShuffleOp<DeviceType::GPU, float> : public Operation { ...@@ -98,7 +99,7 @@ class ChannelShuffleOp<DeviceType::GPU, float> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterChannelShuffle(OpRegistryBase *op_registry) { void RegisterChannelShuffle(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ChannelShuffle", MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::CPU, float); ChannelShuffleOp, DeviceType::CPU, float);
......
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
#ifndef MACE_OPS_COMMON_LSTM_H_ #ifndef MACE_OPS_COMMON_LSTM_H_
#define MACE_OPS_COMMON_LSTM_H_ #define MACE_OPS_COMMON_LSTM_H_
#include "mace/core/ops/op_context.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/core/op_context.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
#endif // MACE_ENABLE_NEON #endif // MACE_ENABLE_NEON
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "mace/core/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
......
...@@ -14,7 +14,8 @@ ...@@ -14,7 +14,8 @@
#include <memory> #include <memory>
#include "mace/core/operator.h" #include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h"
#include "mace/core/quantize.h" #include "mace/core/quantize.h"
#include "mace/utils/memory.h" #include "mace/utils/memory.h"
...@@ -221,7 +222,7 @@ class ConcatOp<DeviceType::GPU, float> : public ConcatOpBase { ...@@ -221,7 +222,7 @@ class ConcatOp<DeviceType::GPU, float> : public ConcatOpBase {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterConcat(OpRegistryBase *op_registry) { void RegisterConcat(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, float); DeviceType::CPU, float);
......
...@@ -24,32 +24,18 @@ ...@@ -24,32 +24,18 @@
#include <vector> #include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/operator.h" #include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/activation.h" #include "mace/ops/activation.h"
#include "mace/ops/conv_pool_2d_base.h" #include "mace/ops/conv_pool_2d_base.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/activation.h"
#include "mace/ops/delegator/bias_add.h"
#include "mace/ops/delegator/conv_2d.h"
#include "mace/utils/memory.h" #include "mace/utils/memory.h"
#include "mace/utils/math.h" #include "mace/utils/math.h"
#ifdef MACE_ENABLE_NEON
#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/ops/arm/fp32/conv_2d_1x1.h"
#include "mace/ops/arm/fp32/conv_2d_3x3.h"
#include "mace/ops/arm/fp32/conv_2d_3x3_winograd.h"
#include "mace/ops/arm/fp32/conv_2d_5x5.h"
#include "mace/ops/arm/fp32/conv_2d_7x7.h"
#include "mace/ops/arm/fp32/conv_2d_1xn.h"
#include "mace/ops/arm/fp32/conv_general.h"
#include "mace/ops/arm/fp32/bias_add.h"
#include "mace/ops/arm/fp32/activation.h"
#else
#include "mace/ops/ref/activation.h"
#include "mace/ops/ref/bias_add.h"
#endif // MACE_ENABLE_NEON
#include "mace/ops/ref/conv_2d.h"
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
#include "mace/ops/common/gemmlowp_util.h" #include "mace/ops/common/gemmlowp_util.h"
#include "mace/ops/arm/q8/quantization_util.h" #include "mace/ops/arm/q8/quantization_util.h"
...@@ -72,13 +58,21 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase { ...@@ -72,13 +58,21 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
public: public:
explicit Conv2dOp(OpConstructContext *context) explicit Conv2dOp(OpConstructContext *context)
: ConvPool2dOpBase(context), : ConvPool2dOpBase(context),
activation_delegator_(ops::StringToActivationType( activation_delegator_(
Operation::GetOptionalArg<std::string>("activation", delegator::Activation::Create(
"NOOP")), context->workspace(),
Operation::GetOptionalArg<float>("max_limit", MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE),
0.0f), delegator::ActivationParam(
Operation::GetOptionalArg<float>( ops::StringToActivationType(
"leakyrelu_coefficient", 0.0f)) {} Operation::GetOptionalArg<std::string>("activation",
"NOOP")),
Operation::GetOptionalArg<float>("max_limit", 0.0f),
Operation::GetOptionalArg<float>("leakyrelu_coefficient",
0.0f)))),
bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
...@@ -86,116 +80,100 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase { ...@@ -86,116 +80,100 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr; const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
#ifdef MACE_ENABLE_NEON
// the following params are used to decide which conv delegator to use
const index_t stride_h = strides_[0];
const index_t stride_w = strides_[1];
const index_t dilation_h = dilations_[0];
const index_t dilation_w = dilations_[1];
const index_t filter_h = filter->dim(2);
const index_t filter_w = filter->dim(3);
const index_t input_channels = input->dim(1);
const index_t channels = filter->dim(0);
// NOTE: delegator is fixed after first round of running,
// although winograd depends on input params.
// We do not support changeable filter for now.
if (conv2d_delegator_ == nullptr) { if (conv2d_delegator_ == nullptr) {
if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 std::string tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
&& dilation_h == 1 && dilation_w == 1) { MACE_CPU_IMPL_TYPE, General);
conv2d_delegator_ = make_unique<arm::fp32::Conv2dK1x1>( if (MACE_CPU_IMPL_TYPE == NEON) {
paddings_, padding_type_); // the following params are used to decide which conv delegator to use
} else if (filter_h == 3 && filter_w == 3 const index_t stride_h = strides_[0];
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 const index_t stride_w = strides_[1];
&& dilation_w == 1) { const index_t dilation_h = dilations_[0];
if (input_channels >= 8 && channels >= 8) { const index_t dilation_w = dilations_[1];
conv2d_delegator_ = make_unique<arm::fp32::Conv2dK3x3Winograd>( const index_t filter_h = filter->dim(2);
paddings_, padding_type_); const index_t filter_w = filter->dim(3);
} else { const index_t input_channels = input->dim(1);
conv2d_delegator_ = make_unique<arm::fp32::Conv2dK3x3S1>( const index_t channels = filter->dim(0);
paddings_, padding_type_); // NOTE: delegator is fixed after first round of running,
// although winograd depends on input params.
// We do not support changeable filter for now.
if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1
&& dilation_h == 1 && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K1x1);
} else if (filter_h == 3 && filter_w == 3
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
if (input_channels >= 8 && channels >= 8) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3Winograd);
} else {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S1);
}
} else if (filter_h == 3 && filter_w == 3
&& stride_h == 2 && stride_w == 2 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S2);
} else if (filter_h == 5 && filter_w == 5
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K5x5S1);
} else if (filter_h == 7 && filter_w == 7
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K7x7S1);
} else if (filter_h == 7 && filter_w == 7
&& stride_h == 2 && stride_w == 2 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K7x7S2);
} else if (filter_h == 7 && filter_w == 7
&& stride_h == 3 && stride_w == 3 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K7x7S3);
} else if (filter_h == 1 && filter_w == 7
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K1x7S1);
} else if (filter_h == 7 && filter_w == 1
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K7x1S1);
} else if (filter_h == 1 && filter_w == 15
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K1x15S1);
} else if (filter_h == 15 && filter_w == 1
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K15x1S1);
} }
} else if (filter_h == 3 && filter_w == 3
&& stride_h == 2 && stride_w == 2 && dilation_h == 1
&& dilation_w == 1) {
conv2d_delegator_ = make_unique<arm::fp32::Conv2dK3x3S2>(
paddings_, padding_type_);
} else if (filter_h == 5 && filter_w == 5
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
conv2d_delegator_ = make_unique<arm::fp32::Conv2dK5x5S1>(
paddings_, padding_type_);
} else if (filter_h == 7 && filter_w == 7
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
conv2d_delegator_ = make_unique<arm::fp32::Conv2dK7x7S1>(
paddings_, padding_type_);
} else if (filter_h == 7 && filter_w == 7
&& stride_h == 2 && stride_w == 2 && dilation_h == 1
&& dilation_w == 1) {
conv2d_delegator_ = make_unique<arm::fp32::Conv2dK7x7S2>(
paddings_, padding_type_);
} else if (filter_h == 7 && filter_w == 7
&& stride_h == 3 && stride_w == 3 && dilation_h == 1
&& dilation_w == 1) {
conv2d_delegator_ = make_unique<arm::fp32::Conv2dK7x7S3>(
paddings_, padding_type_);
} else if (filter_h == 1 && filter_w == 7
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
conv2d_delegator_ = make_unique<arm::fp32::Conv2dK1x7S1>(
paddings_, padding_type_);
} else if (filter_h == 7 && filter_w == 1
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
conv2d_delegator_ = make_unique<arm::fp32::Conv2dK7x1S1>(
paddings_, padding_type_);
} else if (filter_h == 1 && filter_w == 15
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
conv2d_delegator_ = make_unique<arm::fp32::Conv2dK1x15S1>(
paddings_, padding_type_);
} else if (filter_h == 15 && filter_w == 1
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
conv2d_delegator_ = make_unique<arm::fp32::Conv2dK15x1S1>(
paddings_, padding_type_);
} else {
conv2d_delegator_ = make_unique<arm::fp32::Conv2dGeneral>(
strides_,
dilations_,
paddings_,
padding_type_);
} }
delegator::Conv2dParam param(strides_, dilations_,
paddings_, padding_type_);
conv2d_delegator_ = delegator::Conv2d::Create(context->workspace(),
tag, param);
} }
conv2d_delegator_->Compute(context, input, filter, output); conv2d_delegator_->Compute(context, input, filter, output);
#else bias_add_delegator_->Compute(context, output, bias, output);
if (ref_conv2d_delegator_ == nullptr) { activation_delegator_->Compute(context, output, output);
ref_conv2d_delegator_ = make_unique<ref::Conv2d<float>>(strides_,
dilations_,
paddings_,
padding_type_);
}
ref_conv2d_delegator_->Compute(context, input, filter, output);
#endif
bias_add_delegator_.Compute(context, output, bias, output);
activation_delegator_.Compute(context, output, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
private: private:
#ifdef MACE_ENABLE_NEON std::unique_ptr<delegator::Activation> activation_delegator_;
std::unique_ptr<arm::fp32::Conv2dBase> conv2d_delegator_; std::unique_ptr<delegator::BiasAdd> bias_add_delegator_;
arm::fp32::BiasAdd bias_add_delegator_; std::unique_ptr<delegator::Conv2d> conv2d_delegator_;
arm::fp32::Activation activation_delegator_;
#else
std::unique_ptr<ref::Conv2d<float>> ref_conv2d_delegator_;
ref::BiasAdd bias_add_delegator_;
ref::Activation activation_delegator_;
#endif // MACE_ENABLE_NEON
private: private:
MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
...@@ -518,7 +496,7 @@ class Conv2dOp<DeviceType::GPU, float> : public ConvPool2dOpBase { ...@@ -518,7 +496,7 @@ class Conv2dOp<DeviceType::GPU, float> : public ConvPool2dOpBase {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterConv2D(OpRegistryBase *op_registry) { void RegisterConv2D(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::CPU, float); DeviceType::CPU, float);
......
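The Conv2d hunk above replaces a chain of `make_unique<arm::fp32::Conv2dK...>` calls with a chain that only picks a key *tag* (`K1x1`, `K3x3Winograd`, ..., defaulting to `General`) and then calls `delegator::Conv2d::Create` once. Below, the same selection logic is extracted into a standalone function using the thresholds visible in the diff; the function name and signature are illustrative.

```c++
// Sketch of the tag-selection logic visible in the diff; returns the variant
// suffix that MACE_DELEGATOR_KEY_EX would append. Names are illustrative.
#include <iostream>
#include <string>

std::string Conv2dTag(int filter_h, int filter_w, int stride_h, int stride_w,
                      int dilation_h, int dilation_w,
                      int in_channels, int out_channels) {
  const bool unit = (dilation_h == 1 && dilation_w == 1);
  const bool s1 = (stride_h == 1 && stride_w == 1 && unit);
  const bool s2 = (stride_h == 2 && stride_w == 2 && unit);
  if (filter_h == 1 && filter_w == 1 && s1) return "K1x1";
  if (filter_h == 3 && filter_w == 3 && s1)
    // Winograd only pays off with enough channels (>= 8 on both sides).
    return (in_channels >= 8 && out_channels >= 8) ? "K3x3Winograd" : "K3x3S1";
  if (filter_h == 3 && filter_w == 3 && s2) return "K3x3S2";
  if (filter_h == 5 && filter_w == 5 && s1) return "K5x5S1";
  if (filter_h == 7 && filter_w == 7 && s1) return "K7x7S1";
  if (filter_h == 7 && filter_w == 7 && s2) return "K7x7S2";
  if (filter_h == 7 && filter_w == 7 && stride_h == 3 && stride_w == 3 && unit)
    return "K7x7S3";
  if (filter_h == 1 && filter_w == 7 && s1) return "K1x7S1";
  if (filter_h == 7 && filter_w == 1 && s1) return "K7x1S1";
  if (filter_h == 1 && filter_w == 15 && s1) return "K1x15S1";
  if (filter_h == 15 && filter_w == 1 && s1) return "K15x1S1";
  return "General";  // default when no specialized NEON kernel matches
}

int main() {
  std::cout << Conv2dTag(3, 3, 1, 1, 1, 1, 32, 64) << "\n";  // K3x3Winograd
  std::cout << Conv2dTag(9, 9, 1, 1, 1, 1, 32, 64) << "\n";  // General
}
```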
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include <vector> #include <vector>
#include "mace/core/operator.h" #include "mace/core/ops/operator.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
namespace mace { namespace mace {
......
...@@ -14,7 +14,8 @@ ...@@ -14,7 +14,8 @@
#include <memory> #include <memory>
#include "mace/core/operator.h" #include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h"
#include "mace/utils/math.h" #include "mace/utils/math.h"
#include "mace/utils/memory.h" #include "mace/utils/memory.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -132,7 +133,7 @@ class CropOp<DeviceType::GPU, float> : public Operation { ...@@ -132,7 +133,7 @@ class CropOp<DeviceType::GPU, float> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterCrop(OpRegistryBase *op_registry) { void RegisterCrop(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Crop", CropOp, MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp); MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp);
......
...@@ -14,7 +14,8 @@ ...@@ -14,7 +14,8 @@
#include <functional> #include <functional>
#include "mace/core/operator.h" #include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -141,7 +142,7 @@ class CumsumOp<DeviceType::CPU, T> : public Operation { ...@@ -141,7 +142,7 @@ class CumsumOp<DeviceType::CPU, T> : public Operation {
bool checked_; bool checked_;
}; };
void RegisterCumsum(OpRegistryBase *op_registry) { void RegisterCumsum(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Cumsum", CumsumOp, MACE_REGISTER_OP(op_registry, "Cumsum", CumsumOp,
DeviceType::CPU, float); DeviceType::CPU, float);
} }
......
...@@ -14,20 +14,6 @@ ...@@ -14,20 +14,6 @@
#include "mace/ops/deconv_2d.h" #include "mace/ops/deconv_2d.h"
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#include "mace/ops/arm/fp32/deconv_2d_2x2.h"
#include "mace/ops/arm/fp32/deconv_2d_3x3.h"
#include "mace/ops/arm/fp32/deconv_2d_4x4.h"
#include "mace/ops/arm/fp32/deconv_2d_general.h"
#include "mace/ops/arm/fp32/bias_add.h"
#include "mace/ops/arm/fp32/activation.h"
#else
#include "mace/ops/ref/bias_add.h"
#include "mace/ops/ref/activation.h"
#include "mace/ops/ref/deconv_2d.h"
#endif
#include <algorithm> #include <algorithm>
#include <functional> #include <functional>
#include <memory> #include <memory>
...@@ -35,9 +21,13 @@ ...@@ -35,9 +21,13 @@
#include <vector> #include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/registry/ops_registry.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/activation.h" #include "mace/ops/activation.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/activation.h"
#include "mace/ops/delegator/bias_add.h"
#include "mace/ops/delegator/deconv_2d.h"
#include "mace/utils/memory.h" #include "mace/utils/memory.h"
#include "mace/utils/math.h" #include "mace/utils/math.h"
...@@ -49,6 +39,10 @@ ...@@ -49,6 +39,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace {
const std::vector<int> kDeconv2dStrides = {1, 1};
}
template<DeviceType D, class T> template<DeviceType D, class T>
class Deconv2dOp; class Deconv2dOp;
...@@ -57,9 +51,16 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase { ...@@ -57,9 +51,16 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
public: public:
explicit Deconv2dOp(OpConstructContext *context) explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context), : Deconv2dOpBase(context),
activation_delegator_(activation_, activation_delegator_(
relux_max_limit_, delegator::Activation::Create(
leakyrelu_coefficient_) {} context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE),
delegator::ActivationParam(activation_, relux_max_limit_,
leakyrelu_coefficient_))),
bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0); const Tensor *input = this->Input(0);
...@@ -79,91 +80,67 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase { ...@@ -79,91 +80,67 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output); MACE_CHECK_NOTNULL(output);
    if (deconv2d_delegator_ == nullptr) {
      std::string tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
                                              MACE_CPU_IMPL_TYPE, General);
      if (MACE_CPU_IMPL_TYPE == NEON) {
        const index_t kernel_h = filter->dim(2);
        const index_t kernel_w = filter->dim(3);
        bool use_neon_2x2_s1 = kernel_h == kernel_w && kernel_h == 2 &&
            strides_[0] == strides_[1] && strides_[0] == 1;
        bool use_neon_2x2_s2 = kernel_h == kernel_w && kernel_h == 2 &&
            strides_[0] == strides_[1] && strides_[0] == 2;
        bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 &&
            strides_[0] == strides_[1] && strides_[0] == 1;
        bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 &&
            strides_[0] == strides_[1] && strides_[0] == 2;
        bool use_neon_4x4_s1 = kernel_h == kernel_w && kernel_h == 4 &&
            strides_[0] == strides_[1] && strides_[0] == 1;
        bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 &&
            strides_[0] == strides_[1] && strides_[0] == 2;

        if (use_neon_2x2_s1) {
          tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
                                      MACE_CPU_IMPL_TYPE, K2x2S1);
        } else if (use_neon_2x2_s2) {
          tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
                                      MACE_CPU_IMPL_TYPE, K2x2S2);
        } else if (use_neon_3x3_s1) {
          tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
                                      MACE_CPU_IMPL_TYPE, K3x3S1);
        } else if (use_neon_3x3_s2) {
          tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
                                      MACE_CPU_IMPL_TYPE, K3x3S2);
        } else if (use_neon_4x4_s1) {
          tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
                                      MACE_CPU_IMPL_TYPE, K4x4S1);
        } else if (use_neon_4x4_s2) {
          tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
                                      MACE_CPU_IMPL_TYPE, K4x4S2);
        }
      }
      delegator::Deconv2dParam param(strides_, kDeconv2dStrides, paddings_,
                                     padding_type_, model_type_);
      deconv2d_delegator_ = delegator::Deconv2d::Create(context->workspace(),
                                                        tag, param);
    }

    deconv2d_delegator_->Compute(context, input, filter,
                                 output_shape_tensor, output);
    bias_add_delegator_->Compute(context, output, bias, output);
    activation_delegator_->Compute(context, output, output);

    return MaceStatus::MACE_SUCCESS;
  }
 private:
  std::unique_ptr<delegator::Activation> activation_delegator_;
  std::unique_ptr<delegator::BiasAdd> bias_add_delegator_;
  std::unique_ptr<delegator::Deconv2d> deconv2d_delegator_;
};
#ifdef MACE_ENABLE_OPENCL
@@ -258,7 +235,7 @@ class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
};
#endif  // MACE_ENABLE_OPENCL

void RegisterDeconv2D(OpRegistry *op_registry) {
  MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
                   DeviceType::CPU, float);
  MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp);
...
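Distilled from the `Deconv2dOp` constructor above: every CPU op now obtains its helper kernels through the same two steps, composing a registry key and letting the workspace-backed registry instantiate the delegator. A minimal sketch of that pattern; the free function `MakeCpuBiasAdd` is hypothetical, while the `Create` call and key macro are exactly the ones used above:

```c++
#include <memory>

#include "mace/ops/delegator/bias_add.h"

namespace mace {
namespace ops {

// Hypothetical helper: builds the registry key, then lets the registry
// behind the workspace construct whichever BiasAdd implementation
// (NEON or reference) was compiled in and registered under that key.
std::unique_ptr<delegator::BiasAdd> MakeCpuBiasAdd(
    OpConstructContext *context) {
  return delegator::BiasAdd::Create(
      context->workspace(),
      MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE),
      DelegatorParam());
}

}  // namespace ops
}  // namespace mace
```

The payoff of this design is visible in the diff above: the `#ifdef MACE_ENABLE_NEON` forks and the direct `arm::fp32`/`ref` member types disappear, replaced by a single string-keyed lookup.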
@@ -19,7 +19,7 @@
#include <string>
#include <vector>

#include "mace/core/ops/operator.h"
#include "mace/core/types.h"
#include "mace/ops/activation.h"
#include "mace/ops/common/conv_pool_2d_util.h"
...
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,42 +12,50 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_OPS_DELEGATOR_ACTIVATION_H_
#define MACE_OPS_DELEGATOR_ACTIVATION_H_

#include "mace/core/ops/op_context.h"
#include "mace/core/ops/op_delegator.h"
#include "mace/core/registry/op_delegator_registry.h"
#include "mace/ops/common/activation_type.h"

namespace mace {
namespace ops {
namespace delegator {

struct ActivationParam : public DelegatorParam {
  explicit ActivationParam(ActivationType type, const float limit,
                           const float leakyrelu_coefficient)
      : type_(type), limit_(limit),
        leakyrelu_coefficient_(leakyrelu_coefficient) {}

  ActivationType type_;
  const float limit_;
  const float leakyrelu_coefficient_;
};

class Activation : public OpDelegator {
 public:
  explicit Activation(const ActivationParam &param)
      : OpDelegator(param), type_(param.type_), limit_(param.limit_),
        leakyrelu_coefficient_(param.leakyrelu_coefficient_) {}
  virtual ~Activation() = default;

  MACE_DEFINE_DELEGATOR_CREATOR(Activation)

  virtual MaceStatus Compute(const OpContext *context,
                             const Tensor *input,
                             Tensor *output) = 0;

 protected:
  ActivationType type_;
  const float limit_;
  const float leakyrelu_coefficient_;
};

}  // namespace delegator
}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_DELEGATOR_ACTIVATION_H_
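With `Compute` now pure virtual, each backend supplies a concrete subclass. The sketch below is hypothetical: the class name and the ReLU/ReLUX-only math are assumptions, and only `delegator::Activation` and `ActivationParam` come from the header above.

```c++
#include <algorithm>
#include <limits>

#include "mace/ops/delegator/activation.h"

namespace mace {
namespace ops {

// Hypothetical reference backend. Clamps like RELU/RELUX and ignores the
// other activation types for brevity.
class RefActivation : public delegator::Activation {
 public:
  explicit RefActivation(const delegator::ActivationParam &param)
      : delegator::Activation(param) {}

  MaceStatus Compute(const OpContext *context, const Tensor *input,
                     Tensor *output) override {
    (void)context;  // unused by this naive single-threaded version
    MaceStatus st = output->ResizeLike(input);
    if (st != MaceStatus::MACE_SUCCESS) return st;
    Tensor::MappingGuard in_guard(input);
    Tensor::MappingGuard out_guard(output);
    const float *in = input->data<float>();
    float *out = output->mutable_data<float>();
    const float upper = (type_ == RELUX)
        ? limit_ : std::numeric_limits<float>::max();
    for (index_t i = 0; i < input->size(); ++i) {
      out[i] = std::min(std::max(in[i], 0.f), upper);
    }
    return MaceStatus::MACE_SUCCESS;
  }
};

}  // namespace ops
}  // namespace mace
```

A real backend would also be registered in `mace/ops/registry/op_delegators_registry.cc` under the matching key, which is how `delegator::Activation::Create` finds it at run time.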
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,37 +12,32 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_OPS_DELEGATOR_BIAS_ADD_H_
#define MACE_OPS_DELEGATOR_BIAS_ADD_H_

#include "mace/core/ops/op_context.h"
#include "mace/core/ops/op_delegator.h"
#include "mace/core/registry/op_delegator_registry.h"

namespace mace {
namespace ops {
namespace delegator {

class BiasAdd : public OpDelegator {
 public:
  explicit BiasAdd(const DelegatorParam &param) : OpDelegator(param) {}
  virtual ~BiasAdd() = default;

  MACE_DEFINE_DELEGATOR_CREATOR(BiasAdd)

  virtual MaceStatus Compute(const OpContext *context,
                             const Tensor *input,
                             const Tensor *bias,
                             Tensor *output) = 0;
};

}  // namespace delegator
}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_DELEGATOR_BIAS_ADD_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_DELEGATOR_CONV_2D_H_
#define MACE_OPS_DELEGATOR_CONV_2D_H_
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/ops/op_delegator.h"
#include "mace/core/registry/op_delegator_registry.h"
#include "mace/ops/common/conv_pool_2d_util.h"

namespace mace {
namespace ops {

enum ConvType {
  General,
  K1x1,
  K1x7S1,
  K7x1S1,
  K1x15S1,
  K15x1S1,
  K3x3S1,
  K3x3S2,
  K3x3Winograd,
  K5x5S1,
  K7x7S1,
  K7x7S2,
  K7x7S3,
};

namespace delegator {

struct Conv2dParam : public DelegatorParam {
  explicit Conv2dParam(const std::vector<int> &strides,
                       const std::vector<int> &dilations,
                       const std::vector<int> &paddings,
                       const Padding padding_type)
      : strides_(strides), dilations_(dilations),
        paddings_(paddings), padding_type_(padding_type) {}

  const std::vector<int> &strides_;
  const std::vector<int> &dilations_;
  const std::vector<int> &paddings_;
  const Padding padding_type_;
};

class Conv2d : public OpDelegator {
 public:
  explicit Conv2d(const delegator::Conv2dParam &param)
      : OpDelegator(param),
        strides_(param.strides_),
        dilations_(param.dilations_),
        paddings_(param.paddings_),
        padding_type_(param.padding_type_) {}
  virtual ~Conv2d() = default;

  MACE_DEFINE_DELEGATOR_CREATOR(Conv2d)

  virtual MaceStatus Compute(const OpContext *context,
                             const Tensor *input,
                             const Tensor *filter,
                             Tensor *output) = 0;

 protected:
  const std::vector<int> strides_;
  const std::vector<int> dilations_;
  const std::vector<int> paddings_;
  const Padding padding_type_;
};

}  // namespace delegator
}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_DELEGATOR_CONV_2D_H_
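A hedged usage sketch for the header above; the surrounding function and the chosen tag are illustrative. Note that `Conv2dParam` stores references, so the vectors must outlive the call that constructs the delegator:

```c++
#include <memory>
#include <vector>

#include "mace/ops/delegator/conv_2d.h"

namespace mace {
namespace ops {

// Illustrative only: requests the 3x3 stride-1 kernel, mirroring the
// tag-selection pattern used in deconv_2d.cc above.
std::unique_ptr<delegator::Conv2d> MakeConv3x3S1(
    OpConstructContext *context) {
  // static so the references held by Conv2dParam stay valid.
  static const std::vector<int> strides{1, 1};
  static const std::vector<int> dilations{1, 1};
  static const std::vector<int> paddings{0, 0};
  delegator::Conv2dParam param(strides, dilations, paddings, VALID);
  return delegator::Conv2d::Create(
      context->workspace(),
      MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, MACE_CPU_IMPL_TYPE, K3x3S1),
      param);
}

}  // namespace ops
}  // namespace mace
```

In the real ops the referenced vectors are long-lived class members (`strides_`, `paddings_`), which is what makes the reference members of `Conv2dParam` safe.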
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_DELEGATOR_DECONV_2D_H_
#define MACE_OPS_DELEGATOR_DECONV_2D_H_

#include <vector>

#include "mace/core/ops/op_context.h"
#include "mace/core/ops/op_delegator.h"
#include "mace/core/registry/op_delegator_registry.h"
#include "mace/ops/common/conv_pool_2d_util.h"  // Padding

namespace mace {
namespace ops {

enum DeconvType {
  General,
  K2x2S1,
  K2x2S2,
  K3x3S1,
  K3x3S2,
  K4x4S1,
  K4x4S2,
};

namespace delegator {

struct Deconv2dParam : public DelegatorParam {
  explicit Deconv2dParam(const std::vector<int> &strides,
                         const std::vector<int> &dilations,
                         const std::vector<int> &paddings,
                         const Padding padding_type,
                         const FrameworkType framework_type,
                         const int group = 1)
      : strides_(strides), dilations_(dilations),
        paddings_(paddings), padding_type_(padding_type),
        framework_type_(framework_type),
        group_(group) {}

  const std::vector<int> &strides_;
  const std::vector<int> &dilations_;
  const std::vector<int> &paddings_;
  const Padding padding_type_;
  const FrameworkType framework_type_;
  const int group_;
};

class Deconv2d : public OpDelegator {
 public:
  explicit Deconv2d(const Deconv2dParam &param)
      : OpDelegator(param),
        strides_(param.strides_),
        dilations_(param.dilations_),
        paddings_(param.paddings_),
        padding_type_(param.padding_type_),
        framework_type_(param.framework_type_),
        group_(param.group_) {}
  virtual ~Deconv2d() = default;

  MACE_DEFINE_DELEGATOR_CREATOR(Deconv2d)

  virtual MaceStatus Compute(const OpContext *context,
                             const Tensor *input,
                             const Tensor *filter,
                             const Tensor *output_shape,
                             Tensor *output) = 0;

 protected:
  const std::vector<int> strides_;
  const std::vector<int> dilations_;
  const std::vector<int> paddings_;
  const Padding padding_type_;
  const FrameworkType framework_type_;
  const int group_;
};

}  // namespace delegator
}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_DELEGATOR_DECONV_2D_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,35 +12,22 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_OPS_DELEGATOR_DEPTHWISE_CONV_2D_H_
#define MACE_OPS_DELEGATOR_DEPTHWISE_CONV_2D_H_

#include "mace/ops/delegator/conv_2d.h"

namespace mace {
namespace ops {
namespace delegator {

typedef Conv2dParam DepthwiseConv2dParam;
typedef Conv2d DepthwiseConv2d;

}  // namespace delegator
}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_DELEGATOR_DEPTHWISE_CONV_2D_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_DELEGATOR_DEPTHWISE_DECONV_2D_H_
#define MACE_OPS_DELEGATOR_DEPTHWISE_DECONV_2D_H_
#include "mace/ops/delegator/deconv_2d.h"
namespace mace {
namespace ops {
namespace delegator {
typedef Deconv2dParam DepthwiseDeconv2dParam;
typedef Deconv2dParam GroupDeconv2dParam;
typedef Deconv2d DepthwiseDeconv2d;
typedef Deconv2d GroupDeconv2d;
} // namespace delegator
} // namespace ops
} // namespace mace
#endif // MACE_OPS_DELEGATOR_DEPTHWISE_DECONV_2D_H_
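Because the depthwise and group variants are plain typedefs, they reuse the `Deconv2d` interface wholesale; only the registry key and the `group` argument of the param differ. A hypothetical creation call (the helper, its arguments, the `TENSORFLOW` enumerator, and the key spelling are all assumed, following the pattern used for `Deconv2d` above):

```c++
#include <memory>
#include <vector>

#include "mace/ops/delegator/depthwise_deconv_2d.h"

namespace mace {
namespace ops {

// Hypothetical: a grouped deconvolution delegator; setting group equal to
// the input channel count would make it depthwise.
std::unique_ptr<delegator::GroupDeconv2d> MakeGroupDeconv2d(
    OpConstructContext *context, const std::vector<int> &strides,
    const std::vector<int> &paddings, int group) {
  static const std::vector<int> dilations{1, 1};
  delegator::GroupDeconv2dParam param(strides, dilations, paddings,
                                      SAME, TENSORFLOW, group);
  return delegator::GroupDeconv2d::Create(
      context->workspace(),
      MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float,
                            MACE_CPU_IMPL_TYPE, General),
      param);
}

}  // namespace ops
}  // namespace mace
```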
(The remaining 96 file diffs in this commit are collapsed.)