refactor: refactor the delegators for arm

N/A Signed-off-by: N Luxuhui <luxuhui@xiaomi.com>

refactor: refactor the delegators for arm
N/A Signed-off-by: N Luxuhui <luxuhui@xiaomi.com>
9aff3c14 · luxuhui · fbd0ff09 · 9aff3c14 · 9aff3c14 · 9aff3c14
66 changed file
--- a/mace/core/registry/op_delegator_registry.cc
+++ b/mace/core/registry/op_delegator_registry.cc
@@ -60,6 +60,7 @@ MaceStatus OpDelegatorRegistry::Register(const DelegatorInfo &key,
 DelegatorCreator OpDelegatorRegistry::GetCreator(
    const DelegatorInfo &key) const {
  if (registry_.count(key) > 0) {
+    VLOG(3) << "find delegator creator: " << key.ToString();
    return registry_.at(key);
  }


--- a/mace/ops/BUILD.bazel
+++ b/mace/ops/BUILD.bazel
@@ -105,6 +105,7 @@ cc_library(
    name = "arm_neon_kernels",
    srcs = glob(
        [
+            "arm/base/*.cc",
            "arm/fp32/*.cc",
            "arm/fp16/gemv.h",
        ],
@@ -121,6 +122,7 @@ cc_library(
    )),
    hdrs = glob(
        [
+            "arm/base/*.h",
            "arm/fp32/*.h",
        ],
    ) + if_quantize_enabled(glob(

--- a/mace/ops/CMakeLists.txt
+++ b/mace/ops/CMakeLists.txt
@@ -5,6 +5,9 @@ file(GLOB OPS_REF_Q8_KERNELS_SRCS
  ref/q8/*.cc
 )

+file(GLOB OPS_ARM_NEON_BASE_KERNELS_SRCS
+  arm/base/*.cc
+)
 file(GLOB OPS_ARM_NEON_FP32_KERNELS_SRCS
  arm/fp32/*.cc
 )
@@ -32,7 +35,7 @@ if(MACE_ENABLE_QUANTIZE)
 endif(MACE_ENABLE_QUANTIZE)

 if(MACE_ENABLE_NEON)
-  set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_FP32_KERNELS_SRCS})
+  set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_BASE_KERNELS_SRCS} ${OPS_ARM_NEON_FP32_KERNELS_SRCS})
  if(MACE_ENABLE_QUANTIZE)
    set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_Q8_KERNELS_SRCS})
  endif(MACE_ENABLE_QUANTIZE)

--- a/mace/ops/arm/base/activation.cc
+++ b/mace/ops/arm/base/activation.cc
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/activation.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+MaceStatus Activation<T>::Compute(const OpContext *context,
+                                  const Tensor *input, Tensor *output) {
+  Tensor::MappingGuard input_guard(input);
+  if (input != output) {
+    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
+    Tensor::MappingGuard output_guard(output);
+    DoActivation(context, input, output);
+  } else {
+    DoActivation(context, input, output);
+  }
+
+  return MaceStatus::MACE_SUCCESS;
+}
+
+template<typename T>
+void Activation<T>::DoActivation(const OpContext *context,
+                                 const Tensor *input,
+                                 Tensor *output) {
+  const T *input_data = input->data<T>();
+  T *output_data = output->mutable_data<T>();
+  const index_t size = input->size();
+
+  utils::ThreadPool &thread_pool =
+      context->device()->cpu_runtime()->thread_pool();
+
+  switch (type_) {
+    case RELU: {
+      ActivateRelu(&thread_pool, input_data, size, output_data);
+      break;
+    }
+
+    case RELUX: {
+      ActivateRelux(&thread_pool, input_data, size, output_data);
+      break;
+    }
+
+    case LEAKYRELU: {
+      ActivateLeakyRelu(&thread_pool, input_data, size, output_data);
+      break;
+    }
+
+    case TANH: {
+      ActivateTanh(&thread_pool, input_data, size, output_data);
+      break;
+    }
+
+    case SIGMOID: {
+      ActivateSigmoid(&thread_pool, input_data, size, output_data);
+      break;
+    }
+
+    case NOOP: {
+      break;
+    }
+
+    default: {
+      MACE_NOT_IMPLEMENTED;
+    }
+  }
+}
+
+void RegisterActivationDelegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Activation<float>, delegator::ActivationParam,
+      MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::NEON));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/arm/base/activation.h
+++ b/mace/ops/arm/base/activation.h
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_ACTIVATION_H_
+#define MACE_OPS_ARM_BASE_ACTIVATION_H_
+
+#include "mace/ops/delegator/activation.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Activation : public delegator::Activation {
+ public:
+  explicit Activation(const delegator::ActivationParam &param)
+      : delegator::Activation(param) {}
+  ~Activation() = default;
+
+  MaceStatus Compute(const OpContext *context,
+                     const Tensor *input, Tensor *output) override;
+
+ private:
+  void DoActivation(const OpContext *context,
+                    const Tensor *input, Tensor *output);
+
+  void ActivateRelu(utils::ThreadPool *thread_pool, const T *input_data,
+                    const index_t input_size, T *output_data);
+  void ActivateRelux(utils::ThreadPool *thread_pool, const T *input_data,
+                     const index_t input_size, T *output_data);
+  void ActivateLeakyRelu(utils::ThreadPool *thread_pool, const T *input_data,
+                         const index_t input_size, T *output_data);
+  void ActivateTanh(utils::ThreadPool *thread_pool, const T *input_data,
+                    const index_t input_size, T *output_data);
+  void ActivateSigmoid(utils::ThreadPool *thread_pool, const T *input_data,
+                       const index_t input_size, T *output_data);
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_ACTIVATION_H_
--- a/mace/ops/arm/base/bias_add.cc
+++ b/mace/ops/arm/base/bias_add.cc
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/bias_add.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+MaceStatus BiasAdd<T>::Compute(const OpContext *context, const Tensor *input,
+                               const Tensor *bias, Tensor *output) {
+  if (input != output) {
+    if (bias == nullptr) {
+      output->Copy(*input);
+    } else {
+      MACE_RETURN_IF_ERROR(output->ResizeLike(input));
+      Tensor::MappingGuard input_guard(input);
+      Tensor::MappingGuard bias_guard(bias);
+      Tensor::MappingGuard output_guard(output);
+      AddBias(context, input, bias, output);
+    }
+  } else {
+    if (bias != nullptr) {
+      Tensor::MappingGuard input_guard(input);
+      Tensor::MappingGuard bias_guard(bias);
+      AddBias(context, input, bias, output);
+    }
+  }
+
+  return MaceStatus::MACE_SUCCESS;
+}
+
+template<typename T>
+void BiasAdd<T>::AddBias(const OpContext *context, const Tensor *input,
+                         const Tensor *bias, mace::Tensor *output) {
+  auto input_data = input->data<T>();
+  auto bias_data = bias->data<T>();
+  auto output_data = output->mutable_data<T>();
+
+  const index_t batch = input->dim(0);
+  const index_t channels = input->dim(1);
+
+  const index_t height = input->dim(2);
+  const index_t width = input->dim(3);
+  const index_t image_size = height * width;
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  if (bias->dim_size() == 1) {
+    Add1DimBias(&thread_pool, input_data, bias_data,
+                output_data, batch, channels, image_size);
+  } else {
+    Add2DimsBias(&thread_pool, input_data, bias_data,
+                     output_data, batch, channels, image_size);
+  }
+}
+
+void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, BiasAdd<float>, DelegatorParam,
+      MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::NEON));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/arm/base/bias_add.h
+++ b/mace/ops/arm/base/bias_add.h
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_BIAS_ADD_H_
+#define MACE_OPS_ARM_BASE_BIAS_ADD_H_
+
+#include "mace/ops/delegator/bias_add.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class BiasAdd : public delegator::BiasAdd {
+ public:
+  explicit BiasAdd(const DelegatorParam &param) : delegator::BiasAdd(param) {}
+  ~BiasAdd() = default;
+
+  MaceStatus Compute(const OpContext *context, const Tensor *input,
+                     const Tensor *bias, Tensor *output) override;
+
+ private:
+  void AddBias(const OpContext *context, const Tensor *input,
+               const Tensor *bias, Tensor *output);
+
+  void Add1DimBias(utils::ThreadPool *thread_pool, const T *input_data,
+                   const T *bias_data, T *output_data,
+                   const index_t batch, const index_t channels,
+                   const index_t image_size);
+
+  void Add2DimsBias(utils::ThreadPool *thread_pool, const T *input_data,
+                    const T *bias_data, T *output_data,
+                    const index_t batch, const index_t channels,
+                    const index_t image_size);
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_BIAS_ADD_H_
--- a/mace/ops/arm/fp32/conv_2d.cc
+++ b/mace/ops/arm/fp32/conv_2d.cc
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,18 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "mace/ops/arm/fp32/conv_2d.h"
+#include "mace/ops/arm/base/conv_2d.h"

+#include <algorithm>
 #include <memory>
 #include <utility>
-#include <algorithm>

 #include "mace/utils/memory.h"

 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

 void Conv2dBase::CalOutputShapeAndInputPadSize(
    const std::vector<index_t> &input_shape,
@@ -164,10 +163,10 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
  auto scratch_buffer = context->device()->scratch_buffer();
  const index_t padded_in_size =
      MACE_EXTRA_BUFFER_PAD_SIZE + (is_in_padded ? PadAlignSize(
-          sizeof(float) * batch * in_channels * padded_in_height
+          type_size_ * batch * in_channels * padded_in_height
              * padded_in_width) : 0);
  const index_t padded_out_size = is_out_padded ? PadAlignSize(
-      sizeof(float) * batch * out_channels * padded_out_height
+      type_size_ * batch * out_channels * padded_out_height
          * padded_out_width) : 0;

  scratch_buffer->Rewind();
@@ -176,7 +175,7 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
    std::unique_ptr<Tensor>
        padded_in =
        make_unique<Tensor>(scratch_buffer->Scratch(padded_in_size),
-                            DataType::DT_FLOAT);
+                            input->dtype());
    padded_in->Resize({batch, in_channels, padded_in_height, padded_in_width});
    PadInput(*input, in_pad_size[0], in_pad_size[2], padded_in.get());
    *padded_input = std::move(padded_in);
@@ -185,7 +184,7 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
    std::unique_ptr<Tensor>
        padded_out =
        make_unique<Tensor>(scratch_buffer->Scratch(padded_out_size),
-                            DataType::DT_FLOAT);
+                            output->dtype());
    padded_out->Resize({batch, out_channels, padded_out_height,
                        padded_out_width});
    *padded_output = std::move(padded_out);
@@ -206,8 +205,8 @@ void Conv2dBase::PadInput(const Tensor &src,
  const index_t padded_width = dst->dim(3);
  const int pad_bottom = static_cast<int>(padded_height - height - pad_top);
  const int pad_right = static_cast<int>(padded_width - width - pad_left);
-  auto in_data = src.data<float>();
-  auto padded_in_data = dst->mutable_data<float>();
+  auto in_data = src.data<uint8_t>();
+  auto padded_in_data = dst->mutable_data<uint8_t>();

  const index_t img_size = height * width;
  const index_t padded_img_size = padded_height * padded_width;
@@ -215,25 +214,26 @@ void Conv2dBase::PadInput(const Tensor &src,
  for (index_t b = 0; b < batch; ++b) {
    for (index_t c = 0; c < channels; ++c) {
      const index_t bc = b * channels + c;
-      const float *in_base = in_data + bc * img_size;
-      float *padded_in_base = padded_in_data + bc * padded_img_size;
+      const uint8_t *in_base = in_data + bc * img_size * type_size_;
+      uint8_t *padded_in_base =
+          padded_in_data + bc * padded_img_size * type_size_;

-      memset(padded_in_base, 0, sizeof(float) * pad_top * padded_width);
-      padded_in_base += pad_top * padded_width;
+      memset(padded_in_base, 0, type_size_ * pad_top * padded_width);
+      padded_in_base += pad_top * padded_width * type_size_;
      for (index_t h = 0; h < height; ++h) {
        memset(padded_in_base,
               0,
-               sizeof(float) * pad_left);
-        memcpy(padded_in_base + pad_left,
+               type_size_ * pad_left);
+        memcpy(padded_in_base + pad_left * type_size_,
               in_base,
-               sizeof(float) * width);
-        memset(padded_in_base + pad_left + width,
+               type_size_ * width);
+        memset(padded_in_base + (pad_left + width) * type_size_,
               0,
-               sizeof(float) * pad_right);
-        in_base += width;
-        padded_in_base += padded_width;
+               type_size_ * pad_right);
+        in_base += width * type_size_;
+        padded_in_base += padded_width * type_size_;
      }
-      memset(padded_in_base, 0, sizeof(float) * pad_bottom * padded_width);
+      memset(padded_in_base, 0, type_size_ * pad_bottom * padded_width);
    }
  }
 }
@@ -247,8 +247,8 @@ void Conv2dBase::UnPadOutput(const Tensor &src, Tensor *dst) {
  const index_t padded_height = src.dim(2);
  const index_t padded_width = src.dim(3);

-  auto padded_out_data = src.data<float>();
-  auto out_data = dst->mutable_data<float>();
+  auto padded_out_data = src.data<uint8_t>();
+  auto out_data = dst->mutable_data<uint8_t>();

  const index_t img_size = height * width;
  const index_t padded_img_size = padded_height * padded_width;
@@ -256,21 +256,93 @@ void Conv2dBase::UnPadOutput(const Tensor &src, Tensor *dst) {
  for (index_t b = 0; b < batch; ++b) {
    for (index_t c = 0; c < channels; ++c) {
      const index_t bc = (b * channels + c);
-      float *out_base = out_data + bc * img_size;
-      const float *padded_out_base = padded_out_data + bc * padded_img_size;
+      uint8_t *out_base = out_data + bc * img_size * type_size_;
+      const uint8_t *padded_out_base =
+          padded_out_data + bc * padded_img_size * type_size_;

      for (index_t h = 0; h < height; ++h) {
-        memcpy(out_base,
-               padded_out_base,
-               sizeof(float) * width);
-        out_base += width;
-        padded_out_base += padded_width;
+        memcpy(out_base, padded_out_base, type_size_ * width);
+        out_base += width * type_size_;
+        padded_out_base += padded_width * type_size_;
      }  // h
    }  // c
  }  // b
 }

-}  // namespace fp32
+ConvComputeParam Conv2dBase::PreWorkAndGetConv2DParam(
+    const OpContext *context, const Tensor *in_tensor, Tensor *out_tensor) {
+  auto &in_shape = in_tensor->shape();
+  auto &out_shape = out_tensor->shape();
+
+  const index_t batch = in_shape[0];
+  const index_t in_channels = in_shape[1];
+  const index_t in_height = in_shape[2];
+  const index_t in_width = in_shape[3];
+  const index_t out_channels = out_shape[1];
+  const index_t out_height = out_shape[2];
+  const index_t out_width = out_shape[3];
+
+  const index_t in_image_size = in_height * in_width;
+  const index_t out_image_size = out_height * out_width;
+  const index_t in_batch_size = in_channels * in_image_size;
+  const index_t out_batch_size = out_channels * out_image_size;
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  return ConvComputeParam(batch, in_channels, in_height, in_width,
+                          out_channels, out_height, out_width,
+                          in_image_size, out_image_size,
+                          in_batch_size, out_batch_size, &thread_pool);
+}
+
+DepthwiseConvComputeParam Conv2dBase::PreWorkAndGetDepthwiseConv2DParam(
+    const OpContext *context, const Tensor *input,
+    const Tensor *filter, Tensor *output) {
+  std::vector<index_t> out_shape(4);
+  std::vector<int> paddings(2);
+  auto &in_shape = input->shape();
+  auto &filter_shape = filter->shape();
+  CalOutputShapeAndInputPadSize(in_shape, filter_shape, &out_shape, &paddings);
+  out_shape[1] *= filter_shape[1];
+  MACE_CHECK(output->Resize(out_shape) == MaceStatus::MACE_SUCCESS,
+             "Resize failed.");
+  output->Clear();
+
+  const int pad_top = paddings[0] / 2;
+  const int pad_left = paddings[1] / 2;
+
+  const index_t batch = in_shape[0];
+  const index_t in_channels = in_shape[1];
+  const index_t in_height = in_shape[2];
+  const index_t in_width = in_shape[3];
+  const index_t out_channels = out_shape[1];
+  const index_t out_height = out_shape[2];
+  const index_t out_width = out_shape[3];
+
+  const index_t in_image_size = in_height * in_width;
+  const index_t out_image_size = out_height * out_width;
+  const index_t in_batch_size = in_channels * in_image_size;
+  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t multiplier = out_channels / in_channels;
+
+  std::vector<index_t> out_bounds;
+  CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds);
+  const index_t valid_h_start = out_bounds[0];
+  const index_t valid_h_stop = out_bounds[1];
+  const index_t valid_w_start = out_bounds[2];
+  const index_t valid_w_stop = out_bounds[3];
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  return DepthwiseConvComputeParam(
+      batch, in_channels, in_height, in_width, out_channels, out_height,
+      out_width, in_image_size, out_image_size, in_batch_size, out_batch_size,
+      &thread_pool, pad_top, pad_left, multiplier, valid_h_start, valid_h_stop,
+      valid_w_start, valid_w_stop);
+}
+
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

--- a/mace/ops/arm/fp32/conv_2d.h
+++ b/mace/ops/arm/fp32/conv_2d.h
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,28 +12,97 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_CONV_2D_H_
-#define MACE_OPS_ARM_FP32_CONV_2D_H_
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_H_

-#include <vector>
 #include <memory>
+#include <vector>

 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/delegator/conv_2d.h"
-#include "mace/ops/arm/fp32/gemm.h"
+#include "mace/ops/arm/base/gemm.h"
 #include "mace/ops/common/conv_pool_2d_util.h"
+#include "mace/ops/delegator/conv_2d.h"
 #include "mace/public/mace.h"

 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {
+
+struct ConvComputeParam {
+  const index_t batch;
+  const index_t in_channels;
+  const index_t in_height;
+  const index_t in_width;
+  const index_t out_channels;
+  const index_t out_height;
+  const index_t out_width;
+
+  const index_t in_image_size;
+  const index_t out_image_size;
+  const index_t in_batch_size;
+  const index_t out_batch_size;
+
+  utils::ThreadPool &thread_pool;
+
+  ConvComputeParam(const index_t b,
+                   const index_t in_c,
+                   const index_t in_h,
+                   const index_t in_w,
+                   const index_t out_c,
+                   const index_t out_h,
+                   const index_t out_w,
+                   const index_t in_size,
+                   const index_t out_size,
+                   const index_t in_b_size,
+                   const index_t out_b_size,
+                   utils::ThreadPool *thrd_pool)
+      : batch(b), in_channels(in_c), in_height(in_h), in_width(in_w),
+        out_channels(out_c), out_height(out_h), out_width(out_w),
+        in_image_size(in_size), out_image_size(out_size),
+        in_batch_size(in_b_size), out_batch_size(out_b_size),
+        thread_pool(*thrd_pool) {}
+};
+
+struct DepthwiseConvComputeParam : public ConvComputeParam {
+  const int pad_top;
+  const int pad_left;
+  const index_t multiplier;
+  const index_t valid_h_start;
+  const index_t valid_h_stop;
+  const index_t valid_w_start;
+  const index_t valid_w_stop;
+  DepthwiseConvComputeParam(const index_t b,
+                            const index_t in_c,
+                            const index_t in_h,
+                            const index_t in_w,
+                            const index_t out_c,
+                            const index_t out_h,
+                            const index_t out_w,
+                            const index_t in_size,
+                            const index_t out_size,
+                            const index_t in_b_size,
+                            const index_t out_b_size,
+                            utils::ThreadPool *thrd_pool,
+                            const int pad_top_data,
+                            const int pad_left_data,
+                            const index_t multiplier_data,
+                            const index_t valid_height_start,
+                            const index_t valid_height_stop,
+                            const index_t valid_width_start,
+                            const index_t valid_width_stop)
+      : ConvComputeParam(b, in_c, in_h, in_w, out_c, out_h, out_w,
+                         in_size, out_size, in_b_size, out_b_size, thrd_pool),
+        pad_top(pad_top_data), pad_left(pad_left_data),
+        multiplier(multiplier_data),
+        valid_h_start(valid_height_start), valid_h_stop(valid_height_stop),
+        valid_w_start(valid_width_start), valid_w_stop(valid_width_stop) {}
+};

 class Conv2dBase : public delegator::Conv2d {
 public:
-  explicit Conv2dBase(const delegator::Conv2dParam &param)
-      : delegator::Conv2d(param) {}
+  explicit Conv2dBase(const delegator::Conv2dParam &param, int type_size)
+      : delegator::Conv2d(param), type_size_(type_size) {}

  virtual ~Conv2dBase() = default;

@@ -72,11 +141,19 @@ class Conv2dBase : public delegator::Conv2d {
                const int pad_left,
                Tensor *dst);
  void UnPadOutput(const Tensor &src, Tensor *dst);
+
+  ConvComputeParam PreWorkAndGetConv2DParam(
+      const OpContext *context, const Tensor *in_tensor, Tensor *out_tensor);
+  DepthwiseConvComputeParam PreWorkAndGetDepthwiseConv2DParam(
+      const OpContext *context, const Tensor *input,
+      const Tensor *filter, Tensor *output);
+
+ private:
+  int type_size_;
 };

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

-#endif  // MACE_OPS_ARM_FP32_CONV_2D_H_
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_H_
--- a/mace/ops/arm/fp32/conv_2d_1x1.cc
+++ b/mace/ops/arm/fp32/conv_2d_1x1.cc
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,36 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "mace/ops/arm/fp32/conv_2d.h"
-#include "mace/ops/arm/fp32/gemm.h"
-#include "mace/ops/delegator/conv_2d.h"
+#include "mace/ops/arm/base/conv_2d_1x1.h"
+
+#include <vector>

 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {
-
-class Conv2dK1x1 : public Conv2dBase {
- public:
-  explicit Conv2dK1x1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param),
-        gemm_(delegator::GemmParam()) {}
-  virtual ~Conv2dK1x1() {}
-
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
-
- private:
-  Gemm gemm_;
-};

-MaceStatus Conv2dK1x1::Compute(const OpContext *context,
-                               const Tensor *input,
-                               const Tensor *filter,
-                               Tensor *output) {
+template<typename T>
+MaceStatus Conv2dK1x1<T>::Compute(const OpContext *context,
+                                  const Tensor *input,
+                                  const Tensor *filter,
+                                  Tensor *output) {
  index_t batch = input->dim(0);
  index_t in_height = input->dim(2);
  index_t in_width = input->dim(3);
@@ -50,13 +33,8 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,
  std::vector<index_t> output_shape;
  std::vector<int> in_pad_size;
  std::vector<int> out_pad_size;
-  CalOutputShapeAndPadSize(input,
-                           filter,
-                           1,
-                           1,
-                           &output_shape,
-                           &in_pad_size,
-                           &out_pad_size);
+  CalOutputShapeAndPadSize(input, filter, 1, 1,
+                           &output_shape, &in_pad_size, &out_pad_size);
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));

  const index_t out_channels = output_shape[1];
@@ -70,16 +48,16 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,
      in_height != padded_in_height || in_width != padded_in_width;
  auto scratch_buffer = context->device()->scratch_buffer();
  const index_t padded_in_size = is_in_padded ? PadAlignSize(
-      sizeof(float) * batch * in_channels * padded_in_height
+      sizeof(T) * batch * in_channels * padded_in_height
          * padded_in_width) : 0;
  const index_t pack_filter_size =
-      PadAlignSize(sizeof(float) * out_channels * in_channels);
+      PadAlignSize(sizeof(T) * out_channels * in_channels);
  const index_t pack_input_size =
      PadAlignSize(
-          sizeof(float) * in_channels * padded_in_height * padded_in_width);
+          sizeof(T) * in_channels * padded_in_height * padded_in_width);
  const index_t pack_output_size =
      PadAlignSize(
-          sizeof(float) * out_channels * padded_in_height * padded_in_width);
+          sizeof(T) * out_channels * padded_in_height * padded_in_width);

  const index_t gemm_pack_size =
      pack_filter_size + pack_input_size + pack_output_size;
@@ -115,12 +93,11 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,

 void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
-      registry, Conv2dK1x1, delegator::Conv2dParam,
+      registry, Conv2dK1x1<float>, delegator::Conv2dParam,
      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                            float, ImplType::NEON, K1x1));
 }

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace
--- a/mace/ops/arm/base/conv_2d_1x1.h
+++ b/mace/ops/arm/base/conv_2d_1x1.h
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_1X1_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_1X1_H_
+
+#include "mace/ops/arm/base/conv_2d.h"
+#include "mace/ops/arm/base/gemm.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Conv2dK1x1 : public Conv2dBase {
+ public:
+  explicit Conv2dK1x1(const delegator::Conv2dParam &param)
+      : Conv2dBase(param, sizeof(T)),
+        gemm_(delegator::GemmParam()) {}
+  virtual ~Conv2dK1x1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output) override;
+
+ private:
+  Gemm<T> gemm_;
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_1X1_H_
--- a/mace/ops/arm/base/conv_2d_1xn.cc
+++ b/mace/ops/arm/base/conv_2d_1xn.cc
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/conv_2d_1xn.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK1x7S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K1x7S1));
+
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK7x1S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K7x1S1));
+
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK1x15S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K1x15S1));
+
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK15x1S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K15x1S1));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/arm/fp32/conv_2d_1xn.h
+++ b/mace/ops/arm/fp32/conv_2d_1xn.h
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,76 +12,66 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
-#define MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_1XN_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_1XN_H_

 #include <vector>

 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/arm/fp32/conv_2d.h"
+#include "mace/ops/arm/base/conv_2d_mxn.h"
 #include "mace/public/mace.h"

 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

-class Conv2dK1x7S1 : public Conv2dBase {
+template<typename T>
+class Conv2dK1x7S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK1x7S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK1x7S1() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };

-class Conv2dK7x1S1 : public Conv2dBase {
+template<typename T>
+class Conv2dK7x1S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK7x1S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 4, 1) {}
  virtual ~Conv2dK7x1S1() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };

-class Conv2dK1x15S1 : public Conv2dBase {
+template<typename T>
+class Conv2dK1x15S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK1x15S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK1x15S1() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };

-class Conv2dK15x1S1 : public Conv2dBase {
+template<typename T>
+class Conv2dK15x1S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK15x1S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 4, 1) {}
  virtual ~Conv2dK15x1S1() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

-#endif  // MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_1XN_H_
--- a/mace/ops/arm/base/conv_2d_3x3.cc
+++ b/mace/ops/arm/base/conv_2d_3x3.cc
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/conv_2d_3x3.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK3x3S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K3x3S1));
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK3x3S2<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K3x3S2));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/arm/fp32/conv_2d_3x3.h
+++ b/mace/ops/arm/fp32/conv_2d_3x3.h
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,50 +12,44 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
-#define MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_3X3_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_3X3_H_

 #include <vector>

 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/arm/fp32/conv_2d.h"
+#include "mace/ops/arm/base/conv_2d_mxn.h"
 #include "mace/public/mace.h"

 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

-class Conv2dK3x3S1 : public Conv2dBase {
+template<typename T>
+class Conv2dK3x3S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK3x3S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 2, 4) {}
  virtual ~Conv2dK3x3S1() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };

-class Conv2dK3x3S2 : public Conv2dBase {
+template<typename T>
+class Conv2dK3x3S2 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK3x3S2(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK3x3S2() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

-#endif  // MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_3X3_H_
--- a/mace/ops/arm/base/conv_2d_5x5.cc
+++ b/mace/ops/arm/base/conv_2d_5x5.cc
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/conv_2d_5x5.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK5x5S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K5x5S1));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/arm/base/conv_2d_5x5.h
+++ b/mace/ops/arm/base/conv_2d_5x5.h
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_5X5_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_5X5_H_
+
+#include <vector>
+
+#include "mace/core/ops/op_context.h"
+#include "mace/core/tensor.h"
+#include "mace/ops/arm/base/conv_2d_mxn.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Conv2dK5x5S1 : public Conv2dKMxN<T> {
+ public:
+  explicit Conv2dK5x5S1(const delegator::Conv2dParam &param)
+      : Conv2dKMxN<T>(param, 1, 4) {}
+  virtual ~Conv2dK5x5S1() {}
+
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_5X5_H_
--- a/mace/ops/arm/base/conv_2d_7x7.cc
+++ b/mace/ops/arm/base/conv_2d_7x7.cc
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/conv_2d_7x7.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK7x7S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K7x7S1));
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK7x7S2<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K7x7S2));
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK7x7S3<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K7x7S3));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/arm/fp32/conv_2d_7x7.h
+++ b/mace/ops/arm/fp32/conv_2d_7x7.h
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,63 +12,55 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
-#define MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_7X7_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_7X7_H_

 #include <vector>

 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/arm/fp32/conv_2d.h"
+#include "mace/ops/arm/base/conv_2d_mxn.h"
 #include "mace/public/mace.h"

 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

-class Conv2dK7x7S1 : public Conv2dBase {
+template<typename T>
+class Conv2dK7x7S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK7x7S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK7x7S1() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };

-class Conv2dK7x7S2 : public Conv2dBase {
+template<typename T>
+class Conv2dK7x7S2 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK7x7S2(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK7x7S2() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };

-class Conv2dK7x7S3 : public Conv2dBase {
+template<typename T>
+class Conv2dK7x7S3 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK7x7S3(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK7x7S3() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

-#endif  // MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_7X7_H_
--- a/mace/ops/arm/base/conv_2d_general.cc
+++ b/mace/ops/arm/base/conv_2d_general.cc
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/conv_2d_general.h"
+
+#include <memory>
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+MaceStatus Conv2dGeneral<T>::Compute(const OpContext *context,
+                                     const Tensor *input,
+                                     const Tensor *filter,
+                                     Tensor *output) {
+  std::unique_ptr<const Tensor> padded_input;
+  std::unique_ptr<Tensor> padded_output;
+  ResizeOutAndPadInOut(context, input, filter, output, 1, 4,
+                       &padded_input, &padded_output);
+  const Tensor *in_tensor = input;
+  if (padded_input != nullptr) {
+    in_tensor = padded_input.get();
+  }
+  Tensor *out_tensor = output;
+  if (padded_output != nullptr) {
+    out_tensor = padded_output.get();
+  }
+  out_tensor->Clear();
+
+  Tensor::MappingGuard in_guard(input);
+  Tensor::MappingGuard filter_guard(filter);
+  Tensor::MappingGuard out_guard(output);
+
+  const T *filter_data = filter->data<T>();
+  const T *input_data = in_tensor->data<T>();
+  T *output_data = out_tensor->mutable_data<T>();
+
+  const ConvComputeParam p =
+      PreWorkAndGetConv2DParam(context, in_tensor, out_tensor);
+  auto &filter_shape = filter->shape();
+
+  DoCompute(p, filter_data, input_data, output_data, filter_shape);
+
+  UnPadOutput(*out_tensor, output);
+  return MaceStatus::MACE_SUCCESS;
+}
+
+void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dGeneral<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::NEON));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/arm/base/conv_2d_general.h
+++ b/mace/ops/arm/base/conv_2d_general.h
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_GENERAL_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_GENERAL_H_
+
+#include <vector>
+
+#include "mace/core/ops/op_context.h"
+#include "mace/core/tensor.h"
+#include "mace/ops/arm/base/conv_2d.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Conv2dGeneral : public Conv2dBase {
+ public:
+  explicit Conv2dGeneral(const delegator::Conv2dParam &param)
+      : Conv2dBase(param, sizeof(T)) {}
+  virtual ~Conv2dGeneral() {}
+
+  MaceStatus Compute(const OpContext *context, const Tensor *input,
+                     const Tensor *filter, Tensor *output) override;
+
+ protected:
+  MaceStatus DoCompute(
+      const ConvComputeParam &p, const T *filter_data,
+      const T *input_data, T *output_data,
+      const std::vector<index_t> &filter_shape);
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_GENERAL_H_
--- a/mace/ops/arm/base/conv_2d_mxn.h
+++ b/mace/ops/arm/base/conv_2d_mxn.h
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_MXN_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_MXN_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/core/ops/op_context.h"
+#include "mace/core/tensor.h"
+#include "mace/ops/arm/base/conv_2d.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Conv2dKMxN : public Conv2dBase {
+ public:
+  explicit Conv2dKMxN(const delegator::Conv2dParam &param,
+                      const int tile_h, const int tile_w)
+      : Conv2dBase(param, sizeof(T)),
+        out_tile_h_(tile_h), out_tile_w_(tile_w) {}
+
+  virtual ~Conv2dKMxN() {}
+
+  MaceStatus Compute(const OpContext *context, const Tensor *input,
+                     const Tensor *filter, Tensor *output) override {
+    std::unique_ptr<const Tensor> padded_input;
+    std::unique_ptr<Tensor> padded_output;
+    ResizeOutAndPadInOut(context, input, filter, output, out_tile_h_,
+                         out_tile_w_, &padded_input, &padded_output);
+    const Tensor *in_tensor = input;
+    if (padded_input != nullptr) {
+      in_tensor = padded_input.get();
+    }
+    Tensor *out_tensor = output;
+    if (padded_output != nullptr) {
+      out_tensor = padded_output.get();
+    }
+    out_tensor->Clear();
+
+    Tensor::MappingGuard in_guard(input);
+    Tensor::MappingGuard filter_guard(filter);
+    Tensor::MappingGuard out_guard(output);
+
+    const T *filter_data = filter->data<T>();
+    const T *input_data = in_tensor->data<T>();
+    T *output_data = out_tensor->mutable_data<T>();
+
+    const ConvComputeParam p =
+        PreWorkAndGetConv2DParam(context, in_tensor, out_tensor);
+
+    DoCompute(p, filter_data, input_data, output_data);
+
+    UnPadOutput(*out_tensor, output);
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+  virtual MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                               const T *input_data, T *output_data) = 0;
+
+ private:
+  const int out_tile_h_;
+  const int out_tile_w_;
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_MXN_H_
--- a/mace/ops/arm/fp32/deconv_2d.cc
+++ b/mace/ops/arm/fp32/deconv_2d.cc
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,17 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/arm/base/deconv_2d.h"

-#include <utility>
 #include <functional>
-#include "mace/utils/memory.h"
+#include <utility>
+
 #include "mace/ops/common/conv_pool_2d_util.h"
+#include "mace/utils/memory.h"

 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

 MaceStatus Deconv2dBase::ResizeOutAndPadOut(
    const OpContext *context,
@@ -67,7 +67,7 @@ MaceStatus Deconv2dBase::ResizeOutAndPadOut(
        std::accumulate(padded_out_shape.begin(),
                        padded_out_shape.end(),
                        1,
-                        std::multiplies<index_t>()) * sizeof(float);
+                        std::multiplies<index_t>()) * type_size_;
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    index_t scratch_size = PadAlignSize(padded_out_size);
@@ -75,7 +75,7 @@ MaceStatus Deconv2dBase::ResizeOutAndPadOut(

    std::unique_ptr<Tensor>
        padded_out
-        (make_unique<Tensor>(scratch->Scratch(scratch_size), DT_FLOAT));
+        (make_unique<Tensor>(scratch->Scratch(scratch_size), output->dtype()));
    padded_out->Reshape(padded_out_shape);
    *padded_output = std::move(padded_out);
  }
@@ -97,24 +97,97 @@ void Deconv2dBase::UnPadOutput(const Tensor &src,
  const index_t padded_height = src.dim(2);
  const index_t padded_width = src.dim(3);

-  auto padded_out_data = src.data<float>();
-  auto out_data = dst->mutable_data<float>();
+  auto padded_out_data = src.data<uint8_t>();
+  auto out_data = dst->mutable_data<uint8_t>();

  for (index_t i = 0; i < batch; ++i) {
    for (index_t j = 0; j < channels; ++j) {
      for (index_t k = 0; k < height; ++k) {
-        const float *input_base =
+        const uint8_t *input_base =
            padded_out_data + ((i * channels + j) * padded_height
-                + (k + pad_h)) * padded_width;
-        float *output_base =
-            out_data + ((i * channels + j) * height + k) * width;
-        memcpy(output_base, input_base + pad_w, width * sizeof(float));
+                + (k + pad_h)) * padded_width * type_size_;
+        uint8_t *output_base =
+            out_data + ((i * channels + j) * height + k) * width * type_size_;
+        memcpy(output_base,
+               input_base + pad_w * type_size_,
+               width * type_size_);
      }
    }
  }
 }

-}  // namespace fp32
+DeconvComputeParam Deconv2dBase::PreWorkAndGetDeconvParam(
+    const OpContext *context, const Tensor *input, Tensor *out_tensor) {
+
+  auto &in_shape = input->shape();
+  auto &out_shape = out_tensor->shape();
+
+  const index_t batch = in_shape[0];
+  const index_t inch = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+
+  const index_t outch = out_shape[1];
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+  const index_t out_img_size = outh * outw;
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  return DeconvComputeParam(batch, inch, h, w, outch, outh, outw,
+                            out_img_size, &thread_pool);
+}
+
+DepthwiseDeconvComputeParam Deconv2dBase::PreWorkAndGetDepthwiseDeconvParam(
+    const OpContext *context, const Tensor *input, Tensor *out_tensor) {
+  auto &in_shape = input->shape();
+  auto &out_shape = out_tensor->shape();
+
+  const index_t batch = in_shape[0];
+  const index_t channels = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+  const index_t in_img_size = h * w;
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+  const index_t out_img_size = outh * outw;
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  return DepthwiseDeconvComputeParam(batch, channels, h, w, in_img_size,
+                                     outh, outw, out_img_size, &thread_pool);
+}
+
+GroupDeconvComputeParam Deconv2dBase::PreWorkAndGetGroupDeconvParam(
+    const OpContext *context, const Tensor *input, Tensor *out_tensor) {
+  auto &in_shape = input->shape();
+  auto &out_shape = out_tensor->shape();
+
+  const index_t batch = in_shape[0];
+  const index_t inch = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+
+  const index_t outch = out_shape[1];
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+
+  const index_t in_img_size = h * w;
+  const index_t out_img_size = outh * outw;
+
+  const index_t inch_g = inch / group_;
+  const index_t outch_g = outch / group_;
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  return GroupDeconvComputeParam(batch, inch, h, w, outch, outh, outw,
+                                 in_img_size, out_img_size, inch_g,
+                                 outch_g, &thread_pool);
+}
+
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace
--- a/mace/ops/arm/base/deconv_2d.h
+++ b/mace/ops/arm/base/deconv_2d.h
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_DECONV_2D_H_
+#define MACE_OPS_ARM_BASE_DECONV_2D_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/core/ops/op_context.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/ops/arm/base/gemm.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+#include "mace/ops/delegator/deconv_2d.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+struct DeconvComputeParam {
+  const index_t batch;
+  const index_t in_channels;
+  const index_t in_height;
+  const index_t in_width;
+  const index_t out_channels;
+  const index_t out_height;
+  const index_t out_width;
+  const index_t out_img_size;
+
+  utils::ThreadPool &thread_pool;
+
+  DeconvComputeParam(const index_t b,
+                     const index_t in_c,
+                     const index_t in_h,
+                     const index_t in_w,
+                     const index_t out_c,
+                     const index_t out_h,
+                     const index_t out_w,
+                     const index_t out_size,
+                     utils::ThreadPool *thrd_pool)
+      : batch(b), in_channels(in_c), in_height(in_h), in_width(in_w),
+        out_channels(out_c), out_height(out_h), out_width(out_w),
+        out_img_size(out_size), thread_pool(*thrd_pool) {}
+};
+
+struct DepthwiseDeconvComputeParam {
+  const index_t batch;
+  const index_t in_channels;
+  const index_t in_height;
+  const index_t in_width;
+  const index_t in_img_size;
+  const index_t out_height;
+  const index_t out_width;
+  const index_t out_img_size;
+  utils::ThreadPool &thread_pool;
+
+  DepthwiseDeconvComputeParam(const index_t b,
+                              const index_t in_c,
+                              const index_t in_h,
+                              const index_t in_w,
+                              const index_t in_size,
+                              const index_t out_h,
+                              const index_t out_w,
+                              const index_t out_size,
+                              utils::ThreadPool *thrd_pool)
+      : batch(b),
+        in_channels(in_c),
+        in_height(in_h),
+        in_width(in_w),
+        in_img_size(in_size),
+        out_height(out_h),
+        out_width(out_w),
+        out_img_size(out_size),
+        thread_pool(*thrd_pool) {}
+};
+
+struct GroupDeconvComputeParam {
+  const index_t batch;
+  const index_t in_channels;
+  const index_t in_height;
+  const index_t in_width;
+
+  const index_t out_channels;
+  const index_t out_height;
+  const index_t out_width;
+
+  const index_t in_img_size;
+  const index_t out_img_size;
+
+  const index_t inch_g;
+  const index_t outch_g;
+  utils::ThreadPool &thread_pool;
+
+  GroupDeconvComputeParam(const index_t in_b,
+                          const index_t in_ch,
+                          const index_t in_h,
+                          const index_t in_w,
+                          const index_t out_ch,
+                          const index_t out_h,
+                          const index_t out_w,
+                          const index_t in_size,
+                          const index_t out_size,
+                          const index_t in_ch_g,
+                          const index_t out_ch_g,
+                          utils::ThreadPool *thrd_pool)
+      : batch(in_b),
+        in_channels(in_ch),
+        in_height(in_h),
+        in_width(in_w),
+        out_channels(out_ch),
+        out_height(out_h),
+        out_width(out_w),
+        in_img_size(in_size),
+        out_img_size(out_size),
+        inch_g(in_ch_g),
+        outch_g(out_ch_g),
+        thread_pool(*thrd_pool) {}
+};
+
+class Deconv2dBase : public delegator::Deconv2d {
+ public:
+  explicit Deconv2dBase(const delegator::Deconv2dParam &param, int type_size)
+      : delegator::Deconv2d(param),
+        group_(param.group_), type_size_(type_size) {}
+
+  virtual ~Deconv2dBase() = default;
+
+ protected:
+  MaceStatus ResizeOutAndPadOut(const OpContext *context,
+                                const Tensor *input,
+                                const Tensor *filter,
+                                const Tensor *output_shape,
+                                Tensor *output,
+                                std::vector<int> *out_pad_size,
+                                std::unique_ptr<Tensor> *padded_output);
+
+  void UnPadOutput(const Tensor &src,
+                   const std::vector<int> &out_pad_size,
+                   Tensor *dst);
+
+  DeconvComputeParam PreWorkAndGetDeconvParam(
+      const OpContext *context, const Tensor *input, Tensor *out_tensor);
+  DepthwiseDeconvComputeParam PreWorkAndGetDepthwiseDeconvParam(
+      const OpContext *context, const Tensor *input, Tensor *out_tensor);
+  GroupDeconvComputeParam PreWorkAndGetGroupDeconvParam(
+      const OpContext *context, const Tensor *input, Tensor *out_tensor);
+
+ protected:
+  index_t group_;
+
+ private:
+  int type_size_;
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_DECONV_2D_H_
--- a/mace/ops/arm/base/deconv_2d_2x2.cc
+++ b/mace/ops/arm/base/deconv_2d_2x2.cc
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/deconv_2d_2x2.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Deconv2dK2x2S1<float>, delegator::Deconv2dParam,
+      MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K2x2S1));
+  MACE_REGISTER_DELEGATOR(
+      registry, Deconv2dK2x2S2<float>, delegator::Deconv2dParam,
+      MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K2x2S2));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/arm/fp32/deconv_2d_2x2.h
+++ b/mace/ops/arm/fp32/deconv_2d_2x2.h
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_
-#define MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_
+#ifndef MACE_OPS_ARM_BASE_DECONV_2D_2X2_H_
+#define MACE_OPS_ARM_BASE_DECONV_2D_2X2_H_

 #include <vector>
 #include <memory>
@@ -21,46 +21,38 @@
 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
 #include "mace/core/types.h"
-#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/arm/base/deconv_2d_mxn.h"
 #include "mace/ops/common/conv_pool_2d_util.h"
 #include "mace/public/mace.h"

 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

-class Deconv2dK2x2S1 : public Deconv2dBase {
+template<typename T>
+class Deconv2dK2x2S1 : public Deconv2dKMxN<T> {
 public:
  explicit Deconv2dK2x2S1(const delegator::Deconv2dParam &param)
-      : Deconv2dBase(param) {}
+      : Deconv2dKMxN<T>(param) {}
  virtual ~Deconv2dK2x2S1() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      const Tensor *output_shape,
-      Tensor *output) override;
+  MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
+                       const T *input_data, T *padded_out_data) override;
 };

-class Deconv2dK2x2S2 : public Deconv2dBase {
+template<typename T>
+class Deconv2dK2x2S2 : public Deconv2dKMxN<T> {
 public:
  explicit Deconv2dK2x2S2(const delegator::Deconv2dParam &param)
-      : Deconv2dBase(param) {}
+      : Deconv2dKMxN<T>(param) {}
  virtual ~Deconv2dK2x2S2() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      const Tensor *output_shape,
-      Tensor *output) override;
+  MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
+                       const T *input_data, T *padded_out_data) override;
 };

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

-#endif  // MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_
+#endif  // MACE_OPS_ARM_BASE_DECONV_2D_2X2_H_
--- a/mace/ops/arm/base/deconv_2d_3x3.cc
+++ b/mace/ops/arm/base/deconv_2d_3x3.cc
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/deconv_2d_3x3.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Deconv2dK3x3S1<float>, delegator::Deconv2dParam,
+      MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K3x3S1));
+  MACE_REGISTER_DELEGATOR(
+      registry, Deconv2dK3x3S2<float>, delegator::Deconv2dParam,
+      MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K3x3S2));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/arm/fp32/deconv_2d_3x3.h
+++ b/mace/ops/arm/fp32/deconv_2d_3x3.h
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_
-#define MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_
+#ifndef MACE_OPS_ARM_BASE_DECONV_2D_3X3_H_
+#define MACE_OPS_ARM_BASE_DECONV_2D_3X3_H_

 #include <vector>
 #include <memory>
@@ -21,46 +21,38 @@
 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
 #include "mace/core/types.h"
-#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/arm/base/deconv_2d_mxn.h"
 #include "mace/ops/common/conv_pool_2d_util.h"
 #include "mace/public/mace.h"

 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

-class Deconv2dK3x3S1 : public Deconv2dBase {
+template<typename T>
+class Deconv2dK3x3S1 : public Deconv2dKMxN<T> {
 public:
  explicit Deconv2dK3x3S1(const delegator::Deconv2dParam &param)
-      : Deconv2dBase(param) {}
+      : Deconv2dKMxN<T>(param) {}
  virtual ~Deconv2dK3x3S1() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      const Tensor *output_shape,
-      Tensor *output) override;
+  MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
+                       const T *input_data, T *padded_out_data) override;
 };

-class Deconv2dK3x3S2 : public Deconv2dBase {
+template<typename T>
+class Deconv2dK3x3S2 : public Deconv2dKMxN<T> {
 public:
  explicit Deconv2dK3x3S2(const delegator::Deconv2dParam &param)
-      : Deconv2dBase(param) {}
+      : Deconv2dKMxN<T>(param) {}
  virtual ~Deconv2dK3x3S2() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      const Tensor *output_shape,
-      Tensor *output) override;
+  MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
+                       const T *input_data, T *padded_out_data) override;
 };

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

-#endif  // MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_
+#endif  // MACE_OPS_ARM_BASE_DECONV_2D_3X3_H_
--- a/mace/ops/arm/base/deconv_2d_4x4.cc
+++ b/mace/ops/arm/base/deconv_2d_4x4.cc
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/deconv_2d_4x4.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Deconv2dK4x4S1<float>, delegator::Deconv2dParam,
+      MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K4x4S1));
+  MACE_REGISTER_DELEGATOR(
+      registry, Deconv2dK4x4S2<float>, delegator::Deconv2dParam,
+      MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K4x4S2));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/arm/fp32/deconv_2d_4x4.h
+++ b/mace/ops/arm/fp32/deconv_2d_4x4.h
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,55 +12,47 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_
-#define MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_
+#ifndef MACE_OPS_ARM_BASE_DECONV_2D_4X4_H_
+#define MACE_OPS_ARM_BASE_DECONV_2D_4X4_H_

-#include <vector>
 #include <memory>
+#include <vector>

 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
 #include "mace/core/types.h"
-#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/arm/base/deconv_2d_mxn.h"
 #include "mace/ops/common/conv_pool_2d_util.h"
 #include "mace/public/mace.h"

 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

-class Deconv2dK4x4S1 : public Deconv2dBase {
+template<typename T>
+class Deconv2dK4x4S1 : public Deconv2dKMxN<T> {
 public:
  explicit Deconv2dK4x4S1(const delegator::Deconv2dParam &param)
-      : Deconv2dBase(param) {}
+      : Deconv2dKMxN<T>(param) {}
  virtual ~Deconv2dK4x4S1() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      const Tensor *output_shape,
-      Tensor *output) override;
+  MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
+                       const T *input_data, T *padded_out_data) override;
 };

-class Deconv2dK4x4S2 : public Deconv2dBase {
+template<typename T>
+class Deconv2dK4x4S2 : public Deconv2dKMxN<T> {
 public:
  explicit Deconv2dK4x4S2(const delegator::Deconv2dParam &param)
-      : Deconv2dBase(param) {}
+      : Deconv2dKMxN<T>(param) {}
  virtual ~Deconv2dK4x4S2() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      const Tensor *output_shape,
-      Tensor *output) override;
+  MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
+                       const T *input_data, T *padded_out_data) override;
 };

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

-#endif  // MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_
+#endif  // MACE_OPS_ARM_BASE_DECONV_2D_4X4_H_
--- a/mace/ops/arm/fp32/deconv_2d_general.cc
+++ b/mace/ops/arm/fp32/deconv_2d_general.cc
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,34 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/arm/base/deconv_2d_general.h"

-// TODO(liutuo): optimize it
+#include <memory>
+#include <vector>

 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {
-
-class Deconv2dGeneral : public Deconv2dBase {
- public:
-  explicit Deconv2dGeneral(const delegator::Deconv2dParam &param)
-      : Deconv2dBase(param) {}
-  virtual ~Deconv2dGeneral() {}
-
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      const Tensor *output_shape,
-      Tensor *output) override;
-};
-
-MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
-                                    const Tensor *input,
-                                    const Tensor *filter,
-                                    const Tensor *output_shape,
-                                    Tensor *output) {
+
+template<typename T>
+MaceStatus Deconv2dGeneral<T>::Compute(const OpContext *context,
+                                       const Tensor *input,
+                                       const Tensor *filter,
+                                       const Tensor *output_shape,
+                                       Tensor *output) {
  std::unique_ptr<Tensor> padded_out;
  std::vector<int> out_pad_size;
  ResizeOutAndPadOut(context,
@@ -60,9 +47,9 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
  Tensor::MappingGuard filter_mapper(filter);
  Tensor::MappingGuard output_mapper(output);

-  auto input_data = input->data<float>();
-  auto filter_data = filter->data<float>();
-  auto padded_out_data = out_tensor->mutable_data<float>();
+  auto input_data = input->data<T>();
+  auto filter_data = filter->data<T>();
+  auto padded_out_data = out_tensor->mutable_data<T>();

  auto &in_shape = input->shape();
  auto &out_shape = out_tensor->shape();
@@ -95,7 +82,7 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
                            index_t start1, index_t end1, index_t step1) {
    for (index_t b = start0; b < end0; b += step0) {
      for (index_t oc = start1; oc < end1; oc += step1) {
-        float *out_base =
+        T *out_base =
            padded_out_data + (b * out_channels + oc) * out_img_size;
        for (index_t i = 0; i < in_height; ++i) {
          for (index_t j = 0; j < in_width; ++j) {
@@ -104,7 +91,7 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
            for (int ic = 0; ic < in_channels; ++ic) {
              const index_t input_idx =
                  (b * in_channels + ic) * in_img_size + i * in_width + j;
-              const float val = input_data[input_idx];
+              const T val = input_data[input_idx];
              const index_t kernel_offset =
                  (oc * in_channels + ic) * kernel_size;
              for (int k = 0; k < kernel_size; ++k) {
@@ -126,11 +113,10 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context,

 void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
-      registry, Deconv2dGeneral, delegator::Deconv2dParam,
+      registry, Deconv2dGeneral<float>, delegator::Deconv2dParam,
      MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, float, ImplType::NEON));
 }

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

--- a/mace/ops/arm/base/deconv_2d_general.h
+++ b/mace/ops/arm/base/deconv_2d_general.h
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_DECONV_2D_GENERAL_H_
+#define MACE_OPS_ARM_BASE_DECONV_2D_GENERAL_H_
+
+#include "mace/ops/arm/base/deconv_2d.h"
+
+// TODO(liutuo): optimize it
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Deconv2dGeneral : public Deconv2dBase {
+ public:
+  explicit Deconv2dGeneral(const delegator::Deconv2dParam &param)
+      : Deconv2dBase(param, sizeof(T)) {}
+  virtual ~Deconv2dGeneral() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_DECONV_2D_GENERAL_H_
+
--- a/mace/ops/arm/base/deconv_2d_mxn.h
+++ b/mace/ops/arm/base/deconv_2d_mxn.h
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_DECONV_2D_MXN_H_
+#define MACE_OPS_ARM_BASE_DECONV_2D_MXN_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/core/ops/op_context.h"
+#include "mace/core/tensor.h"
+#include "mace/ops/arm/base/deconv_2d.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Deconv2dKMxN : public Deconv2dBase {
+ public:
+  explicit Deconv2dKMxN(const delegator::Deconv2dParam &param)
+      : Deconv2dBase(param, sizeof(T)) {}
+  virtual ~Deconv2dKMxN() {}
+
+  MaceStatus Compute(const OpContext *context,
+                     const Tensor *input,
+                     const Tensor *filter,
+                     const Tensor *output_shape,
+                     Tensor *output) {
+    std::unique_ptr<Tensor> padded_out;
+    std::vector<int> out_pad_size;
+    ResizeOutAndPadOut(context, input, filter, output_shape,
+                       output, &out_pad_size, &padded_out);
+
+    Tensor *out_tensor = output;
+    if (padded_out != nullptr) {
+      out_tensor = padded_out.get();
+    }
+    out_tensor->Clear();
+
+    Tensor::MappingGuard input_mapper(input);
+    Tensor::MappingGuard filter_mapper(filter);
+    Tensor::MappingGuard output_mapper(output);
+
+    const T *input_data = input->data<T>();
+    const T *filter_data = filter->data<T>();
+    T *padded_out_data = out_tensor->mutable_data<T>();
+
+    const DeconvComputeParam p =
+        PreWorkAndGetDeconvParam(context, input, out_tensor);
+    DoCompute(p, filter_data, input_data, padded_out_data);
+    UnPadOutput(*out_tensor, out_pad_size, output);
+
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+  virtual MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
+                               const T *input_data, T *padded_out_data) = 0;
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_DECONV_2D_MXN_H_
--- a/mace/ops/arm/base/depthwise_conv_2d_3x3.cc
+++ b/mace/ops/arm/base/depthwise_conv_2d_3x3.cc
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/depthwise_conv_2d_3x3.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, DepthwiseConv2dK3x3S1<float>, delegator::DepthwiseConv2dParam,
+      MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K3x3S1));
+  MACE_REGISTER_DELEGATOR(
+      registry, DepthwiseConv2dK3x3S2<float>, delegator::DepthwiseConv2dParam,
+      MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K3x3S2));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h
+++ b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,51 +12,47 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_
-#define MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_
+#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_3X3_H_
+#define MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_3X3_H_

 #include <vector>

 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/arm/fp32/conv_2d.h"
+#include "mace/ops/arm/base/depthwise_conv_2d_mxn.h"
 #include "mace/ops/delegator/depthwise_conv_2d.h"
 #include "mace/public/mace.h"

 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

-class DepthwiseConv2dK3x3S1 : public Conv2dBase {
+template<typename T>
+class DepthwiseConv2dK3x3S1 : public DepthwiseConv2dKMxN<T> {
 public:
  explicit DepthwiseConv2dK3x3S1(const delegator::DepthwiseConv2dParam &param)
-      : Conv2dBase(param) {}
+      : DepthwiseConv2dKMxN<T>(param) {}
  virtual ~DepthwiseConv2dK3x3S1() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(
+      const DepthwiseConvComputeParam &p, const T *filter,
+      const T *input_data, T *output_data) override;
 };

-class DepthwiseConv2dK3x3S2 : public Conv2dBase {
+template<typename T>
+class DepthwiseConv2dK3x3S2 : public DepthwiseConv2dKMxN<T> {
 public:
  explicit DepthwiseConv2dK3x3S2(const delegator::DepthwiseConv2dParam &param)
-      : Conv2dBase(param) {}
+      : DepthwiseConv2dKMxN<T>(param) {}
  virtual ~DepthwiseConv2dK3x3S2() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(
+      const DepthwiseConvComputeParam &p, const T *filter,
+      const T *input_data, T *output_data) override;
 };

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

-#endif  // MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_
+#endif  // MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_3X3_H_
--- a/mace/ops/arm/fp32/deconv_2d.h
+++ b/mace/ops/arm/fp32/deconv_2d.h
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,51 +12,53 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_DECONV_2D_H_
-#define MACE_OPS_ARM_FP32_DECONV_2D_H_
+#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_MXN_H_
+#define MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_MXN_H_

 #include <vector>
-#include <memory>

 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/core/types.h"
-#include "mace/ops/arm/fp32/gemm.h"
-#include "mace/ops/common/conv_pool_2d_util.h"
-#include "mace/ops/delegator/deconv_2d.h"
+#include "mace/ops/arm/base/conv_2d.h"
+#include "mace/ops/delegator/depthwise_conv_2d.h"
 #include "mace/public/mace.h"

 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

-class Deconv2dBase : public delegator::Deconv2d {
+template<typename T>
+class DepthwiseConv2dKMxN : public Conv2dBase {
 public:
-  explicit Deconv2dBase(const delegator::Deconv2dParam &param)
-      : delegator::Deconv2d(param),
-        group_(param.group_) {}
+  explicit DepthwiseConv2dKMxN(const delegator::DepthwiseConv2dParam &param)
+      : Conv2dBase(param, sizeof(T)) {}
+  virtual ~DepthwiseConv2dKMxN() {}

-  virtual ~Deconv2dBase() = default;
+  MaceStatus Compute(const OpContext *context, const Tensor *input,
+                     const Tensor *filter, Tensor *output) {
+    DepthwiseConvComputeParam p =
+        PreWorkAndGetDepthwiseConv2DParam(context, input, filter, output);
+
+    Tensor::MappingGuard in_guard(input);
+    Tensor::MappingGuard filter_guard(filter);
+    Tensor::MappingGuard out_guard(output);
+    const T *filter_data = filter->data<T>();
+    const T *input_data = input->data<T>();
+    T *output_data = output->mutable_data<T>();
+
+    DoCompute(p, filter_data, input_data, output_data);
+
+    return MaceStatus::MACE_SUCCESS;
+  }

 protected:
-  MaceStatus ResizeOutAndPadOut(const OpContext *context,
-                                const Tensor *input,
-                                const Tensor *filter,
-                                const Tensor *output_shape,
-                                Tensor *output,
-                                std::vector<int> *out_pad_size,
-                                std::unique_ptr<Tensor> *padded_output);
-
-  void UnPadOutput(const Tensor &src,
-                   const std::vector<int> &out_pad_size,
-                   Tensor *dst);
-  index_t group_;
+  virtual MaceStatus DoCompute(
+      const DepthwiseConvComputeParam &p, const T *filter,
+      const T *input_data, T *output_data) = 0;
 };

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

-#endif  // MACE_OPS_ARM_FP32_DECONV_2D_H_
+#endif  // MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_MXN_H_
--- a/mace/ops/arm/base/depthwise_deconv_2d_3x3.cc
+++ b/mace/ops/arm/base/depthwise_deconv_2d_3x3.cc
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/depthwise_deconv_2d_3x3.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+void RegisterDepthwiseDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, DepthwiseDeconv2dK3x3S1<float>,
+      delegator::DepthwiseDeconv2dParam,
+      MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K3x3S1));
+  MACE_REGISTER_DELEGATOR(
+      registry, DepthwiseDeconv2dK3x3S2<float>,
+      delegator::DepthwiseDeconv2dParam,
+      MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K3x3S2));
+}
+
+void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, GroupDeconv2dK3x3S1<float>, delegator::GroupDeconv2dParam,
+      MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K3x3S1));
+  MACE_REGISTER_DELEGATOR(
+      registry, GroupDeconv2dK3x3S2<float>, delegator::GroupDeconv2dParam,
+      MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K3x3S2));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h
+++ b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_
-#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_
+#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_3X3_H_
+#define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_3X3_H_

 #include <vector>
 #include <memory>
@@ -21,7 +21,7 @@
 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
 #include "mace/core/types.h"
-#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/arm/base/depthwise_deconv_2d_mxn.h"
 #include "mace/ops/common/conv_pool_2d_util.h"
 #include "mace/ops/delegator/depthwise_deconv_2d.h"
 #include "mace/public/mace.h"
@@ -29,70 +29,56 @@
 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

-class DepthwiseDeconv2dK3x3S1 : public Deconv2dBase {
+template<typename T>
+class DepthwiseDeconv2dK3x3S1 : public DepthwiseDeconv2dKMxN<T> {
 public:
  explicit DepthwiseDeconv2dK3x3S1(
      const delegator::DepthwiseDeconv2dParam &param)
-      : Deconv2dBase(param) {}
+      : DepthwiseDeconv2dKMxN<T>(param) {}
  virtual ~DepthwiseDeconv2dK3x3S1() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      const Tensor *output_shape,
-      Tensor *output) override;
+  MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter,
+                       const T *input_data, T *padded_out_data) override;
 };

-class DepthwiseDeconv2dK3x3S2 : public Deconv2dBase {
+template<typename T>
+class DepthwiseDeconv2dK3x3S2 : public DepthwiseDeconv2dKMxN<T> {
 public:
  explicit DepthwiseDeconv2dK3x3S2(
      const delegator::DepthwiseDeconv2dParam &param)
-      : Deconv2dBase(param) {}
+      : DepthwiseDeconv2dKMxN<T>(param) {}
  virtual ~DepthwiseDeconv2dK3x3S2() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      const Tensor *output_shape,
-      Tensor *output) override;
+  MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter,
+                       const T *input_data, T *padded_out_data) override;
 };

-class GroupDeconv2dK3x3S1 : public Deconv2dBase {
+template<typename T>
+class GroupDeconv2dK3x3S1 : public GroupDeconv2dKMxN<T> {
 public:
  explicit GroupDeconv2dK3x3S1(
      const delegator::GroupDeconv2dParam &param)
-      : Deconv2dBase(param) {}
+      : GroupDeconv2dKMxN<T>(param) {}
  virtual ~GroupDeconv2dK3x3S1() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      const Tensor *output_shape,
-      Tensor *output) override;
+  MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter,
+                       const T *input_data, T *padded_out_data) override;
 };

-class GroupDeconv2dK3x3S2 : public Deconv2dBase {
+template<typename T>
+class GroupDeconv2dK3x3S2 : public GroupDeconv2dKMxN<T> {
 public:
  explicit GroupDeconv2dK3x3S2(const delegator::GroupDeconv2dParam &param)
-      : Deconv2dBase(param) {}
+      : GroupDeconv2dKMxN<T>(param) {}
  virtual ~GroupDeconv2dK3x3S2() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      const Tensor *output_shape,
-      Tensor *output) override;
+  MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter,
+                       const T *input_data, T *padded_out_data) override;
 };

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

-#endif  // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_
+#endif  // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_3X3_H_
--- a/mace/ops/arm/base/depthwise_deconv_2d_4x4.cc
+++ b/mace/ops/arm/base/depthwise_deconv_2d_4x4.cc
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/depthwise_deconv_2d_4x4.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+void RegisterDepthwiseDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, DepthwiseDeconv2dK4x4S1<float>,
+      delegator::DepthwiseDeconv2dParam,
+      MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K4x4S1));
+  MACE_REGISTER_DELEGATOR(
+      registry, DepthwiseDeconv2dK4x4S2<float>,
+      delegator::DepthwiseDeconv2dParam,
+      MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K4x4S2));
+}
+
+void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, GroupDeconv2dK4x4S1<float>, delegator::GroupDeconv2dParam,
+      MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K4x4S1));
+  MACE_REGISTER_DELEGATOR(
+      registry, GroupDeconv2dK4x4S2<float>, delegator::GroupDeconv2dParam,
+      MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K4x4S2));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
--- a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h
+++ b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_
-#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_
+#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_4X4_H_
+#define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_4X4_H_

 #include <vector>
 #include <memory>
@@ -21,7 +21,7 @@
 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
 #include "mace/core/types.h"
-#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/arm/base/depthwise_deconv_2d_mxn.h"
 #include "mace/ops/common/conv_pool_2d_util.h"
 #include "mace/ops/delegator/depthwise_deconv_2d.h"
 #include "mace/public/mace.h"
@@ -29,69 +29,55 @@
 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

-class DepthwiseDeconv2dK4x4S1 : public Deconv2dBase {
+template<typename T>
+class DepthwiseDeconv2dK4x4S1 : public DepthwiseDeconv2dKMxN<T> {
 public:
  explicit DepthwiseDeconv2dK4x4S1(
      const delegator::DepthwiseDeconv2dParam &param)
-      : Deconv2dBase(param) {}
+      : DepthwiseDeconv2dKMxN<T>(param) {}
  virtual ~DepthwiseDeconv2dK4x4S1() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      const Tensor *output_shape,
-      Tensor *output) override;
+  MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter,
+                       const T *input_data, T *padded_out_data) override;
 };

-class DepthwiseDeconv2dK4x4S2 : public Deconv2dBase {
+template<typename T>
+class DepthwiseDeconv2dK4x4S2 : public DepthwiseDeconv2dKMxN<T> {
 public:
  explicit DepthwiseDeconv2dK4x4S2(
      const delegator::DepthwiseDeconv2dParam &param)
-      : Deconv2dBase(param) {}
+      : DepthwiseDeconv2dKMxN<T>(param) {}
  virtual ~DepthwiseDeconv2dK4x4S2() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      const Tensor *output_shape,
-      Tensor *output) override;
+  MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter,
+                       const T *input_data, T *padded_out_data) override;
 };

-class GroupDeconv2dK4x4S1 : public Deconv2dBase {
+template<typename T>
+class GroupDeconv2dK4x4S1 : public GroupDeconv2dKMxN<T> {
 public:
  explicit GroupDeconv2dK4x4S1(const delegator::GroupDeconv2dParam &param)
-      : Deconv2dBase(param) {}
+      : GroupDeconv2dKMxN<T>(param) {}
  virtual ~GroupDeconv2dK4x4S1() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      const Tensor *output_shape,
-      Tensor *output) override;
+  MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter,
+                       const T *input_data, T *padded_out_data) override;
 };

-class GroupDeconv2dK4x4S2 : public Deconv2dBase {
+template<typename T>
+class GroupDeconv2dK4x4S2 : public GroupDeconv2dKMxN<T> {
 public:
  explicit GroupDeconv2dK4x4S2(const delegator::GroupDeconv2dParam &param)
-      : Deconv2dBase(param) {}
+      : GroupDeconv2dKMxN<T>(param) {}
  virtual ~GroupDeconv2dK4x4S2() {}

-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      const Tensor *output_shape,
-      Tensor *output) override;
+  MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter,
+                       const T *input_data, T *padded_out_data) override;
 };

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

-#endif  // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_
+#endif  // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_4X4_H_
--- a/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc
+++ b/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,18 +12,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "mace/ops/arm/fp32/depthwise_deconv_2d_general.h"
+#include "mace/ops/arm/base/depthwise_deconv_2d_general.h"

 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

-MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context,
-                                             const Tensor *input,
-                                             const Tensor *filter,
-                                             const Tensor *output_shape,
-                                             Tensor *output) {
+template<typename T>
+MaceStatus DepthwiseDeconv2dGeneral<T>::Compute(const OpContext *context,
+                                                const Tensor *input,
+                                                const Tensor *filter,
+                                                const Tensor *output_shape,
+                                                Tensor *output) {
  std::unique_ptr<Tensor> padded_out;
  std::vector<int> out_pad_size;
  group_ = input->dim(1);
@@ -46,9 +46,9 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context,
  Tensor::MappingGuard filter_mapper(filter);
  Tensor::MappingGuard output_mapper(output);

-  auto input_data = input->data<float>();
-  auto filter_data = filter->data<float>();
-  auto padded_out_data = out_tensor->mutable_data<float>();
+  const T *input_data = input->data<T>();
+  const T *filter_data = filter->data<T>();
+  T *padded_out_data = out_tensor->mutable_data<T>();

  auto &in_shape = input->shape();
  auto &out_shape = out_tensor->shape();
@@ -79,7 +79,7 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context,
                            index_t start1, index_t end1, index_t step1) {
    for (index_t b = start0; b < end0; b += step0) {
      for (index_t c = start1; c < end1; c += step1) {
-        float *out_base =
+        T *out_base =
            padded_out_data + (b * channels + c) * out_img_size;
        for (index_t i = 0; i < in_height; ++i) {
          for (index_t j = 0; j < in_width; ++j) {
@@ -105,11 +105,12 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context,
  return MaceStatus::MACE_SUCCESS;
 }

-MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context,
-                                         const Tensor *input,
-                                         const Tensor *filter,
-                                         const Tensor *output_shape,
-                                         Tensor *output) {
+template<typename T>
+MaceStatus GroupDeconv2dGeneral<T>::Compute(const OpContext *context,
+                                            const Tensor *input,
+                                            const Tensor *filter,
+                                            const Tensor *output_shape,
+                                            Tensor *output) {
  std::unique_ptr<Tensor> padded_out;
  std::vector<int> out_pad_size;
  ResizeOutAndPadOut(context,
@@ -131,9 +132,9 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context,
  Tensor::MappingGuard filter_mapper(filter);
  Tensor::MappingGuard output_mapper(output);

-  auto input_data = input->data<float>();
-  auto filter_data = filter->data<float>();
-  auto padded_out_data = out_tensor->mutable_data<float>();
+  const T *input_data = input->data<T>();
+  const T *filter_data = filter->data<T>();
+  T *padded_out_data = out_tensor->mutable_data<T>();

  auto &in_shape = input->shape();
  auto &out_shape = out_tensor->shape();
@@ -209,19 +210,19 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context,

 void RegisterDepthwiseDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
-      registry, DepthwiseDeconv2dGeneral, delegator::DepthwiseDeconv2dParam,
+      registry, DepthwiseDeconv2dGeneral<float>,
+      delegator::DepthwiseDeconv2dParam,
      MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU,
                         float, ImplType::NEON));
 }

 void RegisterGroupDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
-      registry, GroupDeconv2dGeneral, delegator::GroupDeconv2dParam,
+      registry, GroupDeconv2dGeneral<float>, delegator::GroupDeconv2dParam,
      MACE_DELEGATOR_KEY(GroupDeconv2d, DeviceType::CPU,
                         float, ImplType::NEON));
 }

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace
--- a/mace/ops/arm/fp32/depthwise_deconv_2d_general.h
+++ b/mace/ops/arm/fp32/depthwise_deconv_2d_general.h
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_
-#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_
+#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_GENERAL_H_
+#define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_GENERAL_H_

 #include <vector>
 #include <memory>
@@ -21,7 +21,7 @@
 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
 #include "mace/core/types.h"
-#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/arm/base/deconv_2d.h"
 #include "mace/ops/common/conv_pool_2d_util.h"
 #include "mace/ops/delegator/depthwise_deconv_2d.h"
 #include "mace/public/mace.h"
@@ -29,13 +29,13 @@
 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

+template<typename T>
 class DepthwiseDeconv2dGeneral : public Deconv2dBase {
 public:
  explicit DepthwiseDeconv2dGeneral(
      const delegator::DepthwiseDeconv2dParam &param)
-      : Deconv2dBase(param) {}
+      : Deconv2dBase(param, sizeof(T)) {}
  virtual ~DepthwiseDeconv2dGeneral() {}

  MaceStatus Compute(
@@ -46,10 +46,11 @@ class DepthwiseDeconv2dGeneral : public Deconv2dBase {
      Tensor *output) override;
 };

+template<typename T>
 class GroupDeconv2dGeneral : public Deconv2dBase {
 public:
  explicit GroupDeconv2dGeneral(const delegator::GroupDeconv2dParam &param)
-      : Deconv2dBase(param) {}
+      : Deconv2dBase(param, sizeof(T)) {}
  virtual ~GroupDeconv2dGeneral() {}

  MaceStatus Compute(
@@ -60,9 +61,8 @@ class GroupDeconv2dGeneral : public Deconv2dBase {
      Tensor *output) override;
 };

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

-#endif  // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_
+#endif  // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_GENERAL_H_
--- a/mace/ops/arm/base/depthwise_deconv_2d_mxn.h
+++ b/mace/ops/arm/base/depthwise_deconv_2d_mxn.h
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_MXN_H_
+#define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_MXN_H_
+
+#include <vector>
+#include <memory>
+
+#include "mace/core/ops/op_context.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/ops/arm/base/deconv_2d.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+#include "mace/ops/delegator/depthwise_deconv_2d.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class DepthwiseDeconv2dKMxN : public Deconv2dBase {
+ public:
+  explicit DepthwiseDeconv2dKMxN(
+      const delegator::DepthwiseDeconv2dParam &param)
+      : Deconv2dBase(param, sizeof(T)) {}
+  virtual ~DepthwiseDeconv2dKMxN() {}
+
+  MaceStatus Compute(
+      const OpContext *context, const Tensor *input, const Tensor *filter,
+      const Tensor *output_shape, Tensor *output) override {
+    std::unique_ptr<Tensor> padded_out;
+    std::vector<int> out_pad_size;
+    group_ = input->dim(1);
+    ResizeOutAndPadOut(context,
+                       input,
+                       filter,
+                       output_shape,
+                       output,
+                       &out_pad_size,
+                       &padded_out);
+
+    Tensor *out_tensor = output;
+    if (padded_out != nullptr) {
+      out_tensor = padded_out.get();
+    }
+
+    out_tensor->Clear();
+
+    Tensor::MappingGuard input_mapper(input);
+    Tensor::MappingGuard filter_mapper(filter);
+    Tensor::MappingGuard output_mapper(output);
+
+    const T *input_data = input->data<float>();
+    const T *filter_data = filter->data<float>();
+    T *padded_out_data = out_tensor->mutable_data<float>();
+
+    DepthwiseDeconvComputeParam p =
+        PreWorkAndGetDepthwiseDeconvParam(context, input, out_tensor);
+    DoCompute(p, filter_data, input_data, padded_out_data);
+    UnPadOutput(*out_tensor, out_pad_size, output);
+
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+  virtual MaceStatus DoCompute(
+      const DepthwiseDeconvComputeParam &p, const T *filter,
+      const T *input_data, T *padded_out_data) = 0;
+};
+
+template<typename T>
+class GroupDeconv2dKMxN : public Deconv2dBase {
+ public:
+  explicit GroupDeconv2dKMxN(
+      const delegator::DepthwiseDeconv2dParam &param)
+      : Deconv2dBase(param, sizeof(T)) {}
+  virtual ~GroupDeconv2dKMxN() {}
+
+  MaceStatus Compute(
+      const OpContext *context, const Tensor *input, const Tensor *filter,
+      const Tensor *output_shape, Tensor *output) override {
+    std::unique_ptr<Tensor> padded_out;
+    std::vector<int> out_pad_size;
+    ResizeOutAndPadOut(context,
+                       input,
+                       filter,
+                       output_shape,
+                       output,
+                       &out_pad_size,
+                       &padded_out);
+
+    Tensor *out_tensor = output;
+    if (padded_out != nullptr) {
+      out_tensor = padded_out.get();
+    }
+
+    out_tensor->Clear();
+
+    Tensor::MappingGuard input_mapper(input);
+    Tensor::MappingGuard filter_mapper(filter);
+    Tensor::MappingGuard output_mapper(output);
+
+    auto input_data = input->data<float>();
+    auto filter_data = filter->data<float>();
+    auto padded_out_data = out_tensor->mutable_data<float>();
+
+    GroupDeconvComputeParam p =
+        PreWorkAndGetGroupDeconvParam(context, input, out_tensor);
+    DoCompute(p, filter_data, input_data, padded_out_data);
+    UnPadOutput(*out_tensor, out_pad_size, output);
+
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+  virtual MaceStatus DoCompute(
+      const GroupDeconvComputeParam &p, const T *filter,
+      const T *input_data, T *padded_out_data) = 0;
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_MXN_H_
--- a/mace/ops/arm/base/gemm.cc
+++ b/mace/ops/arm/base/gemm.cc
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/gemm.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+void RegisterGemmDelegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Gemm<float>, delegator::GemmParam,
+      MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::NEON));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/arm/fp32/gemm.h
+++ b/mace/ops/arm/fp32/gemm.h
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_GEMM_H_
-#define MACE_OPS_ARM_FP32_GEMM_H_
+#ifndef MACE_OPS_ARM_BASE_GEMM_H_
+#define MACE_OPS_ARM_BASE_GEMM_H_

 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
@@ -28,8 +28,10 @@
 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

+enum { kNoCache, kCacheLhs, kCacheRhs };
+
+template<typename T>
 class Gemm : public delegator::Gemm {
 public:
  explicit Gemm(const delegator::GemmParam &param)
@@ -68,26 +70,49 @@ class Gemm : public delegator::Gemm {
      const bool transpose_out,
      const bool lhs_batched,
      const bool rhs_batched,
-      Tensor *output) override;
+      Tensor *output) override {
+    index_t rows = transpose_lhs ? lhs_cols : lhs_rows;
+    index_t depth = transpose_lhs ? lhs_rows : lhs_cols;
+    index_t cols = transpose_rhs ? rhs_rows : rhs_cols;
+    index_t depth2 = transpose_rhs ? rhs_cols : rhs_rows;
+    MACE_CHECK(depth == depth2,
+               "Matrices that multiply have inconsistent depth dim: ",
+               depth,
+               " vs. ",
+               depth2);
+
+    return Compute(context,
+                   lhs,
+                   rhs,
+                   batch,
+                   rows,
+                   cols,
+                   depth,
+                   transpose_lhs ? ColMajor : RowMajor,
+                   transpose_rhs ? ColMajor : RowMajor,
+                   transpose_out ? ColMajor : RowMajor,
+                   lhs_batched,
+                   rhs_batched,
+                   output);
+  }

- private:
-  void ComputeBlock(const float *packed_lhs_data,
-                    const float *packed_rhs_data,
+ protected:
+  void ComputeBlock(const T *packed_lhs_data,
+                    const T *packed_rhs_data,
                    const index_t depth_padded,
-                    float *packed_output_data);
-
-  void PackLhs(const MatrixMap<const float> &lhs,
-               float *packed_lhs);
+                    T *packed_output_data);

-  void PackRhs(const MatrixMap<const float> &rhs,
-               float *packed_rhs);
+  void PackLhs(const MatrixMap<const T> &lhs,
+               T *packed_lhs);

-  void UnpackOutput(const float *packed_output,
-                    MatrixMap<float> *output);
+  void PackRhs(const MatrixMap<const T> &rhs,
+               T *packed_rhs);

+  void UnpackOutput(const T *packed_output,
+                    MatrixMap<T> *output);
  template<int RowBlockSize, int ColBlockSize>
-  void Unpack(const float *packed_output,
-              MatrixMap<float> *output) {
+  void Unpack(const T *packed_output,
+              MatrixMap<T> *output) {
    const index_t rows = output->rows();
    const index_t cols = output->cols();
    for (index_t r = 0; r < rows; ++r) {
@@ -98,9 +123,9 @@ class Gemm : public delegator::Gemm {
  }

  template<int WidthBlockSize, int DepthBlockSize>
-  void Pack(const MatrixMap<const float> &matrix,
+  void Pack(const MatrixMap<const T> &matrix,
            MatrixMajor dst_major,
-            float *packed_matrix) {
+            T *packed_matrix) {
    const index_t rows = matrix.rows();
    const index_t cols = matrix.cols();
    index_t depth = cols;
@@ -109,7 +134,7 @@ class Gemm : public delegator::Gemm {
      depth = rows;
    }
    const index_t depth_padded = RoundUp(depth, static_cast<index_t>(4));
-    memset(packed_matrix, 0, sizeof(float) * WidthBlockSize * depth_padded);
+    memset(packed_matrix, 0, sizeof(T) * WidthBlockSize * depth_padded);
    if (dst_major == ColMajor) {
      for (index_t c = 0; c < cols; ++c) {
        for (index_t r = 0; r < rows; ++r) {
@@ -125,31 +150,14 @@ class Gemm : public delegator::Gemm {
    }
  }

+ private:
  Buffer pack_cache_;
-
  bool should_cache_pack_;
  int cached_;
 };

-template<>
-void Gemm::Pack<4, 4>(const MatrixMap<const float> &matrix,
-                      MatrixMajor dst_major,
-                      float *packed_matrix);
-
-template<>
-void Gemm::Pack<8, 4>(const MatrixMap<const float> &matrix,
-                      MatrixMajor dst_major,
-                      float *packed_matrix);
-
-template<>
-void Gemm::Unpack<4, 8>(const float *packed_output, MatrixMap<float> *output);
-
-template<>
-void Gemm::Unpack<8, 8>(const float *packed_output, MatrixMap<float> *output);
-
-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

-#endif  // MACE_OPS_ARM_FP32_GEMM_H_
+#endif  // MACE_OPS_ARM_BASE_GEMM_H_
--- a/mace/ops/arm/base/gemv.cc
+++ b/mace/ops/arm/base/gemv.cc
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "mace/ops/arm/base/gemv.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Gemv<float>, DelegatorParam,
+      MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/arm/fp32/gemv.h
+++ b/mace/ops/arm/fp32/gemv.h
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_GEMV_H_
-#define MACE_OPS_ARM_FP32_GEMV_H_
+#ifndef MACE_OPS_ARM_BASE_GEMV_H_
+#define MACE_OPS_ARM_BASE_GEMV_H_

 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
@@ -23,8 +23,8 @@
 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

+template<typename T>
 class Gemv : public delegator::Gemv {
 public:
  explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {}
@@ -43,9 +43,8 @@ class Gemv : public delegator::Gemv {
      Tensor *output) override;
 };

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

-#endif  // MACE_OPS_ARM_FP32_GEMV_H_
+#endif  // MACE_OPS_ARM_BASE_GEMV_H_
--- a/mace/ops/arm/fp32/activation.cc
+++ b/mace/ops/arm/fp32/activation.cc
@@ -12,186 +12,139 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "mace/ops/delegator/activation.h"
-
 #include <arm_neon.h>
 #include <algorithm>

+#include "mace/ops/arm/base/activation.h"
+
 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {
-
-class Activation : public delegator::Activation {
- public:
-  explicit Activation(const delegator::ActivationParam &param)
-      : delegator::Activation(param) {}
-  ~Activation() = default;
-
-  MaceStatus Compute(const OpContext *context,
-                     const Tensor *input, Tensor *output) override;
-
- private:
-  void DoActivation(const OpContext *context,
-                    const Tensor *input, Tensor *output);
-};
-
-MaceStatus Activation::Compute(const OpContext *context,
-                               const Tensor *input, Tensor *output) {
-  Tensor::MappingGuard input_guard(input);
-  if (input != output) {
-    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
-    Tensor::MappingGuard output_guard(output);
-    DoActivation(context, input, output);
-  } else {
-    DoActivation(context, input, output);
+
+template<>
+void Activation<float>::ActivateRelu(utils::ThreadPool *thread_pool,
+                                     const float *input_data,
+                                     const index_t input_size,
+                                     float *output_data) {
+  const float32x4_t vzero = vdupq_n_f32(0.f);
+  const index_t block_count = input_size / 4;
+
+  thread_pool->Compute1D(
+      [=](index_t start, index_t end, index_t step) {
+        auto input_ptr = input_data + start * 4;
+        auto output_ptr = output_data + start * 4;
+
+        for (index_t i = start; i < end; i += step) {
+          float32x4_t v = vld1q_f32(input_ptr);
+          v = vmaxq_f32(v, vzero);
+          vst1q_f32(output_ptr, v);
+
+          input_ptr += 4;
+          output_ptr += 4;
+        }
+      },
+      0, block_count, 1);
+
+  // remain
+  for (index_t i = block_count * 4; i < input_size; ++i) {
+    output_data[i] = std::max(0.f, input_data[i]);
  }
+}

-  return MaceStatus::MACE_SUCCESS;
+template<>
+void Activation<float>::ActivateRelux(utils::ThreadPool *thread_pool,
+                                      const float *input_data,
+                                      const index_t input_size,
+                                      float *output_data) {
+  const float32x4_t vzero = vdupq_n_f32(0.f);
+  const float32x4_t vlimit = vdupq_n_f32(limit_);
+  const index_t block_count = input_size / 4;
+
+  thread_pool->Compute1D(
+      [=](index_t start, index_t end, index_t step) {
+        auto input_ptr = input_data + start * 4;
+        auto output_ptr = output_data + start * 4;
+
+        for (index_t i = start; i < end; i += step) {
+          float32x4_t v = vld1q_f32(input_ptr);
+          v = vmaxq_f32(v, vzero);
+          v = vminq_f32(v, vlimit);
+          vst1q_f32(output_ptr, v);
+
+          input_ptr += 4;
+          output_ptr += 4;
+        }
+      },
+      0, block_count, 1);
+
+  // remain
+  for (index_t i = block_count * 4; i < input_size; ++i) {
+    output_data[i] = std::max(0.f, std::min(limit_, input_data[i]));
+  }
 }

-void Activation::DoActivation(const OpContext *context,
-                              const Tensor *input,
-                              Tensor *output) {
-  auto input_data = input->data<float>();
-  auto output_data = output->mutable_data<float>();
-  const index_t size = input->size();
-
-  utils::ThreadPool &thread_pool =
-      context->device()->cpu_runtime()->thread_pool();
-
-  switch (type_) {
-    case RELU: {
-      const float32x4_t vzero = vdupq_n_f32(0.f);
-      const index_t block_count = size / 4;
-
-      thread_pool.Compute1D(
-          [=](index_t start, index_t end, index_t step) {
-            auto input_ptr = input_data + start * 4;
-            auto output_ptr = output_data + start * 4;
-
-            for (index_t i = start; i < end; i += step) {
-              float32x4_t v = vld1q_f32(input_ptr);
-              v = vmaxq_f32(v, vzero);
-              vst1q_f32(output_ptr, v);
-
-              input_ptr += 4;
-              output_ptr += 4;
-            }
-          },
-          0, block_count, 1);
-
-      // remain
-      for (index_t i = block_count * 4; i < size; ++i) {
-        output_data[i] = std::max(0.f, input_data[i]);
-      }
-
-      break;
-    }
-
-    case RELUX: {
-      const float32x4_t vzero = vdupq_n_f32(0.f);
-      const float32x4_t vlimit = vdupq_n_f32(limit_);
-      const index_t block_count = size / 4;
-
-      thread_pool.Compute1D(
-          [=](index_t start, index_t end, index_t step) {
-            auto input_ptr = input_data + start * 4;
-            auto output_ptr = output_data + start * 4;
-
-            for (index_t i = start; i < end; i += step) {
-              float32x4_t v = vld1q_f32(input_ptr);
-              v = vmaxq_f32(v, vzero);
-              v = vminq_f32(v, vlimit);
-              vst1q_f32(output_ptr, v);
-
-              input_ptr += 4;
-              output_ptr += 4;
-            }
-          },
-          0, block_count, 1);
-
-      // remain
-      for (index_t i = block_count * 4; i < size; ++i) {
-        output_data[i] = std::max(0.f, std::min(limit_, input_data[i]));
-      }
-
-      break;
-    }
-
-    case LEAKYRELU: {
-      const float32x4_t vzero = vdupq_n_f32(0.f);
-      const float32x4_t valpha = vdupq_n_f32(leakyrelu_coefficient_);
-      const index_t block_count = size / 4;
-
-      thread_pool.Compute1D(
-          [=](index_t start, index_t end, index_t step) {
-            auto input_ptr = input_data + start * 4;
-            auto output_ptr = output_data + start * 4;
-
-            for (index_t i = start; i < end; i += step) {
-              float32x4_t v = vld1q_f32(input_ptr);
-              float32x4_t u = vminq_f32(v, vzero);
-              v = vmaxq_f32(v, vzero);
-              v = vmlaq_f32(v, valpha, u);
-              vst1q_f32(output_ptr, v);
-
-              input_ptr += 4;
-              output_ptr += 4;
-            }
-          },
-          0, block_count, 1);
-
-      // remain
-      for (index_t i = block_count * 4; i < size; ++i) {
-        output_data[i] = std::max(input_data[i], 0.f) +
-            std::min(input_data[i], 0.f) * leakyrelu_coefficient_;
-      }
-
-      break;
-    }
-
-    case TANH: {
-      thread_pool.Compute1D(
-          [=](index_t start, index_t end, index_t step) {
-            for (index_t i = start; i < end; i += step) {
-              output_data[i] = std::tanh(input_data[i]);
-            }
-          },
-          0, size, 1);
-
-      break;
-    }
-
-    case SIGMOID: {
-      thread_pool.Compute1D(
-          [=](index_t start, index_t end, index_t step) {
-            for (index_t i = start; i < end; i += step) {
-              output_data[i] = 1 / (1 + std::exp(-(input_data[i])));
-            }
-          },
-          0, size, 1);
-
-      break;
-    }
-
-    case NOOP: {
-      break;
-    }
-
-    default: {
-      MACE_NOT_IMPLEMENTED;
-    }
+template<>
+void Activation<float>::ActivateLeakyRelu(utils::ThreadPool *thread_pool,
+                                          const float *input_data,
+                                          const index_t input_size,
+                                          float *output_data) {
+  const float32x4_t vzero = vdupq_n_f32(0.f);
+  const float32x4_t valpha = vdupq_n_f32(leakyrelu_coefficient_);
+  const index_t block_count = input_size / 4;
+
+  thread_pool->Compute1D(
+      [=](index_t start, index_t end, index_t step) {
+        auto input_ptr = input_data + start * 4;
+        auto output_ptr = output_data + start * 4;
+
+        for (index_t i = start; i < end; i += step) {
+          float32x4_t v = vld1q_f32(input_ptr);
+          float32x4_t u = vminq_f32(v, vzero);
+          v = vmaxq_f32(v, vzero);
+          v = vmlaq_f32(v, valpha, u);
+          vst1q_f32(output_ptr, v);
+
+          input_ptr += 4;
+          output_ptr += 4;
+        }
+      },
+      0, block_count, 1);
+
+  // remain
+  for (index_t i = block_count * 4; i < input_size; ++i) {
+    output_data[i] = std::max(input_data[i], 0.f) +
+        std::min(input_data[i], 0.f) * leakyrelu_coefficient_;
  }
 }

-void RegisterActivationDelegator(OpDelegatorRegistry *registry) {
-  MACE_REGISTER_DELEGATOR(
-      registry, Activation, delegator::ActivationParam,
-      MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::NEON));
+template<>
+void Activation<float>::ActivateTanh(utils::ThreadPool *thread_pool,
+                                     const float *input_data,
+                                     const index_t input_size,
+                                     float *output_data) {
+  thread_pool->Compute1D(
+      [=](index_t start, index_t end, index_t step) {
+        for (index_t i = start; i < end; i += step) {
+          output_data[i] = std::tanh(input_data[i]);
+        }
+      },
+      0, input_size, 1);
+}
+
+template<>
+void Activation<float>::ActivateSigmoid(utils::ThreadPool *thread_pool,
+                                        const float *input_data,
+                                        const index_t input_size,
+                                        float *output_data) {
+  thread_pool->Compute1D(
+      [=](index_t start, index_t end, index_t step) {
+        for (index_t i = start; i < end; i += step) {
+          output_data[i] = 1 / (1 + std::exp(-(input_data[i])));
+        }
+      },
+      0, input_size, 1);
 }

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace
--- a/mace/ops/arm/fp32/bias_add.cc
+++ b/mace/ops/arm/fp32/bias_add.cc
@@ -13,129 +13,81 @@
 // limitations under the License.

 #include <arm_neon.h>
-#include "mace/ops/delegator/bias_add.h"
+
+#include "mace/ops/arm/base/bias_add.h"

 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {
-
-class BiasAdd : public delegator::BiasAdd {
- public:
-  explicit BiasAdd(const DelegatorParam &param) : delegator::BiasAdd(param) {}
-  ~BiasAdd() = default;
-
-  MaceStatus Compute(const OpContext *context, const Tensor *input,
-                     const Tensor *bias, Tensor *output) override;

- private:
-  void AddBias(const OpContext *context, const Tensor *input,
-               const Tensor *bias, Tensor *output);
-};
-
-MaceStatus BiasAdd::Compute(const OpContext *context,
-                            const Tensor *input,
-                            const Tensor *bias,
-                            Tensor *output) {
-  Tensor::MappingGuard input_guard(input);
-  Tensor::MappingGuard bias_guard(bias);
-  if (input != output) {
-    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
-    if (bias == nullptr) {
-      output->Copy(*input);
-    } else {
-      Tensor::MappingGuard output_guard(output);
-      AddBias(context, input, bias, output);
-    }
-  } else {
-    if (bias != nullptr) {
-      AddBias(context, input, bias, output);
+template<>
+void BiasAdd<float>::Add1DimBias(
+    utils::ThreadPool *thread_pool, const float *input_data,
+    const float *bias_data, float *output_data, const index_t batch,
+    const index_t channels, const index_t image_size) {
+  const index_t block_count = image_size / 4;
+  const index_t remain = image_size % 4;
+  thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0,
+                             index_t start1, index_t end1, index_t step1) {
+    for (index_t b = start0; b < end0; b += step0) {
+      const index_t b_offset = b * channels;
+      for (index_t c = start1; c < end1; c += step1) {
+        const index_t offset = (b_offset + c) * image_size;
+        auto input_ptr = input_data + offset;
+        auto output_ptr = output_data + offset;
+        const float bias = bias_data[c];
+        float32x4_t vbias = vdupq_n_f32(bias);
+
+        for (index_t i = 0; i < block_count; ++i) {
+          float32x4_t v = vld1q_f32(input_ptr);
+          v = vaddq_f32(v, vbias);
+          vst1q_f32(output_ptr, v);
+
+          input_ptr += 4;
+          output_ptr += 4;
+        }
+        for (index_t i = 0; i < remain; ++i) {
+          (*output_ptr++) = (*input_ptr++) + bias;
+        }
+      }
    }
-  }
-
-  return MaceStatus::MACE_SUCCESS;
+  }, 0, batch, 1, 0, channels, 1);
 }

-void BiasAdd::AddBias(const OpContext *context,
-                      const Tensor *input,
-                      const Tensor *bias,
-                      mace::Tensor *output) {
-  auto input_data = input->data<float>();
-  auto bias_data = bias->data<float>();
-  auto output_data = output->mutable_data<float>();
-
-  const index_t batch = input->dim(0);
-  const index_t channels = input->dim(1);
-  const index_t height = output->dim(2);
-  const index_t width = output->dim(3);
-  const index_t image_size = height * width;
+template<>
+void BiasAdd<float>::Add2DimsBias(
+    utils::ThreadPool *thread_pool, const float *input_data,
+    const float *bias_data, float *output_data, const index_t batch,
+    const index_t channels, const index_t image_size) {
  const index_t block_count = image_size / 4;
  const index_t remain = image_size % 4;
-
-  utils::ThreadPool
-      &thread_pool = context->device()->cpu_runtime()->thread_pool();
-  if (bias->dim_size() == 1) {
-    thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
-                              index_t start1, index_t end1, index_t step1) {
-      for (index_t b = start0; b < end0; b += step0) {
-        const index_t b_offset = b * channels;
-        for (index_t c = start1; c < end1; c += step1) {
-          const index_t offset = (b_offset + c) * image_size;
-          auto input_ptr = input_data + offset;
-          auto output_ptr = output_data + offset;
-          const float bias = bias_data[c];
-          float32x4_t vbias = vdupq_n_f32(bias);
-
-          for (index_t i = 0; i < block_count; ++i) {
-            float32x4_t v = vld1q_f32(input_ptr);
-            v = vaddq_f32(v, vbias);
-            vst1q_f32(output_ptr, v);
-
-            input_ptr += 4;
-            output_ptr += 4;
-          }
-          for (index_t i = 0; i < remain; ++i) {
-            (*output_ptr++) = (*input_ptr++) + bias;
-          }
+  thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0,
+                             index_t start1, index_t end1, index_t step1) {
+    for (index_t b = start0; b < end0; b += step0) {
+      const index_t b_offset = b * channels;
+      for (index_t c = start1; c < end1; c += step1) {
+        const index_t offset = (b_offset + c) * image_size;
+        auto input_ptr = input_data + offset;
+        auto output_ptr = output_data + offset;
+        const float bias = bias_data[b * channels + c];
+        float32x4_t vbias = vdupq_n_f32(bias);
+
+        for (index_t i = 0; i < block_count; ++i) {
+          float32x4_t v = vld1q_f32(input_ptr);
+          v = vaddq_f32(v, vbias);
+          vst1q_f32(output_ptr, v);
+
+          input_ptr += 4;
+          output_ptr += 4;
        }
-      }
-    }, 0, batch, 1, 0, channels, 1);
-  } else {
-    thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
-                              index_t start1, index_t end1, index_t step1) {
-      for (index_t b = start0; b < end0; b += step0) {
-        const index_t b_offset = b * channels;
-        for (index_t c = start1; c < end1; c += step1) {
-          const index_t offset = (b_offset + c) * image_size;
-          auto input_ptr = input_data + offset;
-          auto output_ptr = output_data + offset;
-          const float bias = bias_data[b * channels + c];
-          float32x4_t vbias = vdupq_n_f32(bias);
-
-          for (index_t i = 0; i < block_count; ++i) {
-            float32x4_t v = vld1q_f32(input_ptr);
-            v = vaddq_f32(v, vbias);
-            vst1q_f32(output_ptr, v);
-
-            input_ptr += 4;
-            output_ptr += 4;
-          }
-          for (index_t i = 0; i < remain; ++i) {
-            (*output_ptr++) = (*input_ptr++) + bias;
-          }
+        for (index_t i = 0; i < remain; ++i) {
+          (*output_ptr++) = (*input_ptr++) + bias;
        }
      }
-    }, 0, batch, 1, 0, channels, 1);
-  }
-}
-
-void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) {
-  MACE_REGISTER_DELEGATOR(
-      registry, BiasAdd, DelegatorParam,
-      MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::NEON));
+    }
+  }, 0, batch, 1, 0, channels, 1);
 }

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace
--- a/mace/ops/arm/fp32/common_neon.h
+++ b/mace/ops/arm/fp32/common_neon.h
@@ -21,7 +21,6 @@
 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {

 inline float32x4_t neon_vfma_lane_0(float32x4_t a,
                          float32x4_t b,
@@ -63,7 +62,6 @@ inline float32x4_t neon_vfma_lane_3(float32x4_t a,
 #endif
 }

-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace

--- a/mace/ops/arm/fp32/conv_2d_1xn.cc
+++ b/mace/ops/arm/fp32/conv_2d_1xn.cc
--- a/mace/ops/arm/fp32/conv_2d_3x3.cc
+++ b/mace/ops/arm/fp32/conv_2d_3x3.cc
--- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc
+++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc
@@ -18,8 +18,8 @@

 #include "mace/ops/common/conv_pool_2d_util.h"
 #include "mace/ops/delegator/conv_2d.h"
-#include "mace/utils/memory.h"
 #include "mace/utils/math.h"
+#include "mace/utils/memory.h"

 namespace mace {
 namespace ops {

--- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h
+++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h
@@ -20,8 +20,8 @@

 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/arm/fp32/conv_2d.h"
-#include "mace/ops/arm/fp32/gemm.h"
+#include "mace/ops/arm/base/conv_2d.h"
+#include "mace/ops/arm/base/gemm.h"
 #include "mace/public/mace.h"

 namespace mace {
@@ -32,7 +32,7 @@ namespace fp32 {
 class Conv2dK3x3Winograd : public Conv2dBase {
 public:
  explicit Conv2dK3x3Winograd(const delegator::Conv2dParam &param)
-      : Conv2dBase(param),
+      : Conv2dBase(param, sizeof(float)),
        gemm_(delegator::GemmParam()),
        transformed_filter_(nullptr),
        out_tile_size_(0) {}
@@ -94,7 +94,7 @@ class Conv2dK3x3Winograd : public Conv2dBase {
                          index_t tile_count,
                          float *output);

-  Gemm gemm_;
+  Gemm<float> gemm_;
  std::unique_ptr<Tensor> transformed_filter_;
  index_t out_tile_size_;
 };

--- a/mace/ops/arm/fp32/conv_2d_5x5.cc
+++ b/mace/ops/arm/fp32/conv_2d_5x5.cc
@@ -15,26 +15,12 @@
 #include <arm_neon.h>
 #include <memory>

-#include "mace/ops/arm/fp32/conv_2d.h"
+#include "mace/ops/arm/base/conv_2d_5x5.h"
 #include "mace/ops/delegator/conv_2d.h"

 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {
-
-class Conv2dK5x5S1 : public Conv2dBase {
- public:
-  explicit Conv2dK5x5S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
-  virtual ~Conv2dK5x5S1() {}
-
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
-};

 #define MACE_Conv2dNeonK5x5SnLoadCalc4                    \
  /* load filter (4 outch x 1 height x 4 width) */        \
@@ -91,89 +77,43 @@ class Conv2dK5x5S1 : public Conv2dBase {
  vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
  vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1);

-MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
-                                 const Tensor *input,
-                                 const Tensor *filter,
-                                 Tensor *output) {
-  std::unique_ptr<const Tensor> padded_input;
-  std::unique_ptr<Tensor> padded_output;
-  ResizeOutAndPadInOut(context,
-                       input,
-                       filter,
-                       output,
-                       1,
-                       4,
-                       &padded_input,
-                       &padded_output);
-  const Tensor *in_tensor = input;
-  if (padded_input != nullptr) {
-    in_tensor = padded_input.get();
-  }
-  Tensor *out_tensor = output;
-  if (padded_output != nullptr) {
-    out_tensor = padded_output.get();
-  }
-  out_tensor->Clear();
-
-  Tensor::MappingGuard in_guard(input);
-  Tensor::MappingGuard filter_guard(filter);
-  Tensor::MappingGuard out_guard(output);
-  auto filter_data = filter->data<float>();
-  auto input_data = in_tensor->data<float>();
-  auto output_data = out_tensor->mutable_data<float>();
-
-  auto &in_shape = in_tensor->shape();
-  auto &out_shape = out_tensor->shape();
-
-  const index_t batch = in_shape[0];
-  const index_t in_channels = in_shape[1];
-  const index_t in_height = in_shape[2];
-  const index_t in_width = in_shape[3];
-  const index_t out_channels = out_shape[1];
-  const index_t out_height = out_shape[2];
-  const index_t out_width = out_shape[3];
-
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
-
-  utils::ThreadPool
-      &thread_pool = context->device()->cpu_runtime()->thread_pool();
-
-  thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
-                            index_t start1, index_t end1, index_t step1) {
+template<>
+MaceStatus Conv2dK5x5S1<float>::DoCompute(
+    const ConvComputeParam &p, const float *filter_data,
+    const float *input_data, float *output_data) {
+  p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
+                              index_t start1, index_t end1, index_t step1) {
    for (index_t b = start0; b < end0; b += step0) {
      for (index_t m = start1; m < end1; m += step1) {
-        if (m + 3 < out_channels) {
+        if (m + 3 < p.out_channels) {
          float *out_ptr0_base =
-              output_data + b * out_batch_size + m * out_image_size;
+              output_data + b * p.out_batch_size + m * p.out_image_size;
          float *out_ptr1_base =
-              output_data + b * out_batch_size + (m + 1) * out_image_size;
+              output_data + b * p.out_batch_size + (m + 1) * p.out_image_size;
          float *out_ptr2_base =
-              output_data + b * out_batch_size + (m + 2) * out_image_size;
+              output_data + b * p.out_batch_size + (m + 2) * p.out_image_size;
          float *out_ptr3_base =
-              output_data + b * out_batch_size + (m + 3) * out_image_size;
+              output_data + b * p.out_batch_size + (m + 3) * p.out_image_size;

-          for (index_t c = 0; c < in_channels; ++c) {
+          for (index_t c = 0; c < p.in_channels; ++c) {
            const float *in_ptr_base =
-                input_data + b * in_batch_size + c * in_image_size;
+                input_data + b * p.in_batch_size + c * p.in_image_size;
            const float
-                *filter_ptr0 = filter_data + m * in_channels * 25 + c * 25;
+                *filter_ptr0 = filter_data + m * p.in_channels * 25 + c * 25;
            const float *filter_ptr1 =
-                filter_data + (m + 1) * in_channels * 25 + c * 25;
+                filter_data + (m + 1) * p.in_channels * 25 + c * 25;
            const float *filter_ptr2 =
-                filter_data + (m + 2) * in_channels * 25 + c * 25;
+                filter_data + (m + 2) * p.in_channels * 25 + c * 25;
            const float *filter_ptr3 =
-                filter_data + (m + 3) * in_channels * 25 + c * 25;
-            for (index_t h = 0; h < out_height; ++h) {
-              for (index_t w = 0; w + 3 < out_width; w += 4) {
+                filter_data + (m + 3) * p.in_channels * 25 + c * 25;
+            for (index_t h = 0; h < p.out_height; ++h) {
+              for (index_t w = 0; w + 3 < p.out_width; w += 4) {
                // input offset
-                index_t in_offset = h * in_width + w;
+                index_t in_offset = h * p.in_width + w;
                // output (4 outch x 1 height x 4 width): vo_outch_height
                float32x4_t vo0, vo1, vo2, vo3;
                // load output
-                index_t out_offset = h * out_width + w;
+                index_t out_offset = h * p.out_width + w;
                vo0 = vld1q_f32(out_ptr0_base + out_offset);
                vo1 = vld1q_f32(out_ptr1_base + out_offset);
                vo2 = vld1q_f32(out_ptr2_base + out_offset);
@@ -190,7 +130,7 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,

                  MACE_Conv2dNeonK5x5SnLoadCalc4;

-                  in_offset += in_width;
+                  in_offset += p.in_width;
                  filter_ptr0 += 5;
                  filter_ptr1 += 5;
                  filter_ptr2 += 5;
@@ -210,22 +150,22 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
            }    // h
          }  // c
        } else {
-          for (index_t mm = m; mm < out_channels; ++mm) {
+          for (index_t mm = m; mm < p.out_channels; ++mm) {
            float *out_ptr0_base =
-                output_data + b * out_batch_size + mm * out_image_size;
-            for (index_t c = 0; c < in_channels; ++c) {
+                output_data + b * p.out_batch_size + mm * p.out_image_size;
+            for (index_t c = 0; c < p.in_channels; ++c) {
              const float *in_ptr_base =
-                  input_data + b * in_batch_size + c * in_image_size;
+                  input_data + b * p.in_batch_size + c * p.in_image_size;
              const float
-                  *filter_ptr0 = filter_data + mm * in_channels * 25 + c * 25;
-              for (index_t h = 0; h < out_height; ++h) {
-                for (index_t w = 0; w + 3 < out_width; w += 4) {
+                  *filter_ptr0 = filter_data + mm * p.in_channels * 25 + c * 25;
+              for (index_t h = 0; h < p.out_height; ++h) {
+                for (index_t w = 0; w + 3 < p.out_width; w += 4) {
                  // input offset
-                  index_t in_offset = h * in_width + w;
+                  index_t in_offset = h * p.in_width + w;
                  // output (1 outch x 1 height x 4 width): vo_outch_height
                  float32x4_t vo0;
                  // load output
-                  index_t out_offset = h * out_width + w;
+                  index_t out_offset = h * p.out_width + w;
                  vo0 = vld1q_f32(out_ptr0_base + out_offset);
                  for (index_t r = 0; r < 5; ++r) {
                    // input (3 slide)
@@ -239,7 +179,7 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,

                    MACE_Conv2dNeonK5x5SnLoadCalc1;

-                    in_offset += in_width;
+                    in_offset += p.in_width;
                    filter_ptr0 += 5;
                  }  // r

@@ -252,20 +192,11 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
        }      // if
      }        // m
    }          // b
-  }, 0, batch, 1, 0, out_channels, 4);
+  }, 0, p.batch, 1, 0, p.out_channels, 4);

-  UnPadOutput(*out_tensor, output);
  return MaceStatus::MACE_SUCCESS;
 }

-void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry) {
-  MACE_REGISTER_DELEGATOR(
-      registry, Conv2dK5x5S1, delegator::Conv2dParam,
-      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
-                            float, ImplType::NEON, K5x5S1));
-}
-
-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace
--- a/mace/ops/arm/fp32/conv_2d_7x7.cc
+++ b/mace/ops/arm/fp32/conv_2d_7x7.cc
--- a/mace/ops/arm/fp32/conv_general.cc
+++ b/mace/ops/arm/fp32/conv_general.cc
--- a/mace/ops/arm/fp32/deconv_2d_2x2.cc
+++ b/mace/ops/arm/fp32/deconv_2d_2x2.cc
--- a/mace/ops/arm/fp32/deconv_2d_3x3.cc
+++ b/mace/ops/arm/fp32/deconv_2d_3x3.cc
--- a/mace/ops/arm/fp32/deconv_2d_4x4.cc
+++ b/mace/ops/arm/fp32/deconv_2d_4x4.cc
--- a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc
+++ b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc
--- a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc
+++ b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc
--- a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc
+++ b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc
--- a/mace/ops/arm/fp32/gemm.cc
+++ b/mace/ops/arm/fp32/gemm.cc
--- a/mace/ops/arm/fp32/gemv.cc
+++ b/mace/ops/arm/fp32/gemv.cc
@@ -12,12 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-
-#include "mace/ops/arm/fp32/gemv.h"
-
 #include <arm_neon.h>
 #include <algorithm>

+#include "mace/ops/arm/base/gemv.h"
 #include "mace/utils/math.h"

 #if !defined(__aarch64__)
@@ -34,18 +32,18 @@ float vaddvq_f32(float32x4_t v) {
 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {
-
-MaceStatus Gemv::Compute(const OpContext *context,
-                         const Tensor *lhs,
-                         const Tensor *rhs,
-                         const Tensor *bias,
-                         const index_t batch,
-                         const index_t lhs_height,
-                         const index_t lhs_width,
-                         const bool lhs_batched,
-                         const bool rhs_batched,
-                         Tensor *output) {
+
+template<>
+MaceStatus Gemv<float>::Compute(const OpContext *context,
+                                const Tensor *lhs,
+                                const Tensor *rhs,
+                                const Tensor *bias,
+                                const index_t batch,
+                                const index_t lhs_height,
+                                const index_t lhs_width,
+                                const bool lhs_batched,
+                                const bool rhs_batched,
+                                Tensor *output) {
  MACE_UNUSED(context);

  MACE_CHECK(output->size() == batch * lhs_height,
@@ -378,13 +376,6 @@ MaceStatus Gemv::Compute(const OpContext *context,
 #undef vaddvq_f32
 #endif

-void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
-  MACE_REGISTER_DELEGATOR(
-      registry, Gemv, DelegatorParam,
-      MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON));
-}
-
-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace
--- a/mace/ops/registry/op_delegators_registry.cc
+++ b/mace/ops/registry/op_delegators_registry.cc